Instructions to use Maaac/CodeLLaMA-Linux-BugFix with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use Maaac/CodeLLaMA-Linux-BugFix with PEFT:
Task type is invalid.
- Notebooks
- Google Colab
- Kaggle
| from pydriller import Repository | |
| import os | |
| import json | |
| from tqdm import tqdm | |
| import re | |
| from multiprocessing import Pool | |
# --- Script configuration ---------------------------------------------------

# Local git checkout that is mined for commits.
REPO_PATH = '../linux'

# Destination JSONL file (main() derives a `_test.jsonl` variant in test mode).
OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'

# Leave as False to process the full repository; True limits the run to the
# first MAX_COMMITS_TEST commits.
TEST_MODE = False
MAX_COMMITS_TEST = 50

# Size of the multiprocessing pool; adjust to your actual core count.
NUM_WORKERS = 16

# A commit counts as a bug fix when its lower-cased message contains any of
# these substrings.
BUGFIX_KEYWORDS = [
    'fix', 'bug', 'leak', 'null', 'overflow', 'error',
    'failure', 'crash', 'panic',
    'memory', 'race', 'deadlock', 'corruption',
    'security', 'vulnerability', 'exploit',
    'buffer', 'stack',
]
def is_bugfix_commit(msg):
    """Return True when the commit message mentions any bug-fix keyword.

    The comparison is a case-insensitive substring test against every entry
    of ``BUGFIX_KEYWORDS``.
    """
    lowered = msg.lower()
    for keyword in BUGFIX_KEYWORDS:
        if keyword in lowered:
            return True
    return False
# Lower-cased prefixes of lines that carry patch metadata rather than a
# description of the change. 'acked-by' is spelled out in full so that ordinary
# subject lines beginning with "Ack..." (e.g. "Acknowledge IRQs ...") are kept,
# and the cherry-pick marker is listed with and without its opening parenthesis
# because `git cherry-pick -x` writes "(cherry picked from commit <sha>)".
_SKIPPED_LINE_PREFIXES = (
    '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'acked-by',
    'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
    'suggested-by', 'fixes:', 'link:', 'cherry picked from commit',
    '(cherry picked from commit',
)


def extract_instruction_from_commit_msg(msg):
    """Pick the first descriptive line of a commit message as the instruction.

    Lines are examined in order after stripping whitespace. A line is skipped
    when it is shorter than 5 characters, contains no alphabetic character, or
    starts (case-insensitively) with one of the metadata prefixes above.

    Args:
        msg: Full commit message. ``None`` and blank strings are accepted.

    Returns:
        The first line that survives the filters. If every line is filtered
        out, the first line of the stripped message; if the message is empty,
        the literal string ``"fix"``.
    """
    stripped_msg = msg.strip() if msg else ""
    if not stripped_msg:
        return "fix"

    for raw_line in stripped_msg.splitlines():
        line = raw_line.strip()
        if len(line) < 5 or not any(ch.isalpha() for ch in line):
            continue
        if line.lower().startswith(_SKIPPED_LINE_PREFIXES):
            continue
        return line

    # Nothing descriptive was found: fall back to the first line, stripped so
    # this path returns text in the same shape as the loop above.
    return stripped_msg.splitlines()[0].strip()
def extract_code_context(code, line_number, context_lines=10):
    """Return a window of ``code`` around ``line_number``.

    The text is split on ``'\\n'`` and the slice
    ``[line_number - context_lines, line_number + context_lines)`` is taken,
    with the lower bound clamped to 0 and the upper bound clamped to the
    number of lines.

    Args:
        code: Source text; a falsy value yields ``""``.
        line_number: Position used directly as an index into the split lines.
        context_lines: Number of lines taken on each side of that index.

    Returns:
        The selected lines joined with ``'\\n'``.
    """
    if not code:
        return ""

    source_lines = code.split('\n')

    first = line_number - context_lines
    if first < 0:
        first = 0

    last = line_number + context_lines
    if last > len(source_lines):
        last = len(source_lines)

    return '\n'.join(source_lines[first:last])
def extract_diff_context(diff_text, context_lines=5):
    """Trim a diff to the span between its first and last changed line.

    A "changed" line is any line starting with ``+`` or ``-``. The result
    keeps ``context_lines`` extra lines before the first and after the last
    changed line, clamped to the bounds of the diff.

    Args:
        diff_text: Diff text; a falsy value yields ``""``.
        context_lines: Number of surrounding lines to keep on each side.

    Returns:
        The trimmed diff, or ``diff_text`` unchanged when it contains no
        changed lines.
    """
    if not diff_text:
        return ""

    rows = diff_text.split('\n')

    # One pass that remembers where the changed region begins and ends.
    first_change = None
    last_change = None
    for index, row in enumerate(rows):
        if row.startswith(('+', '-')):
            if first_change is None:
                first_change = index
            last_change = index

    if first_change is None:
        return diff_text

    lower = max(0, first_change - context_lines)
    upper = min(len(rows), last_change + context_lines + 1)
    return '\n'.join(rows[lower:upper])
def create_dataset_entry(original_code, commit_msg, diff_code):
    """Assemble one dataset record.

    Args:
        original_code: Code before the change; stored stripped.
        commit_msg: Commit message from which the instruction line is taken.
        diff_code: Diff text; stored stripped.

    Returns:
        A dict with an ``"input"`` mapping (``"original code"`` and
        ``"instruction"``) and an ``"output"`` mapping (``"diff codes"``).
    """
    model_input = {
        "original code": original_code.strip(),
        "instruction": extract_instruction_from_commit_msg(commit_msg),
    }
    model_output = {"diff codes": diff_code.strip()}
    return {"input": model_input, "output": model_output}
def process_commit(commit):
    """Turn one commit into zero or more dataset entries.

    Only commits whose message passes ``is_bugfix_commit`` are considered, and
    within them only modified (not added/deleted/renamed) ``.c``/``.h`` files
    that have both a diff and a pre-change source.

    Args:
        commit: Commit object exposing ``msg`` and ``modified_files`` (the
            script obtains it from pydriller).

    Returns:
        A list of entries built by ``create_dataset_entry``, one per
        qualifying file; empty when nothing qualifies.
    """
    if not is_bugfix_commit(commit.msg):
        return []

    collected = []
    for mod in commit.modified_files:
        path = mod.new_path
        if not path or not path.endswith(('.c', '.h')):
            continue
        if mod.change_type.name != "MODIFY":
            continue
        if not (mod.diff and mod.source_code_before):
            continue

        focused_diff = extract_diff_context(mod.diff)

        # Old-file start line of the first hunk header that parses; later
        # hunks do not influence the extracted code window.
        first_hunk_start = None
        for diff_line in mod.diff.split('\n'):
            if not diff_line.startswith('@@'):
                continue
            header = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', diff_line)
            if header:
                first_hunk_start = int(header.group(1))
                break

        if first_hunk_start is not None:
            focused_code = extract_code_context(mod.source_code_before, first_hunk_start)
        else:
            # No parsable hunk header: fall back to the first 50 lines.
            focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])

        collected.append(
            create_dataset_entry(
                original_code=focused_code,
                commit_msg=commit.msg,
                diff_code=focused_diff,
            )
        )
    return collected
def collect_entries_from_hash(commit_hash):
    """Load a single commit by hash and convert it into dataset entries.

    This is the worker function mapped over all hashes by the process pool in
    ``main``. It is deliberately best-effort: a commit that cannot be loaded or
    processed contributes no entries instead of aborting the whole run, but the
    failure is reported on stderr so it does not go unnoticed.

    Args:
        commit_hash: Hash of the commit to look up in ``REPO_PATH``.

    Returns:
        The list produced by ``process_commit``, or ``[]`` when the commit is
        missing or processing raised.
    """
    import sys  # local import: only used for reporting failures

    try:
        commits = Repository(REPO_PATH, only_commits=[commit_hash]).traverse_commits()
        # Use a default so an absent commit is handled explicitly rather than
        # surfacing as StopIteration.
        commit = next(commits, None)
        if commit is None:
            print("[WARN] Commit not found:", commit_hash, file=sys.stderr)
            return []
        return process_commit(commit)
    except Exception as exc:
        # Keep the run going, but say which commit was dropped and why.
        print("[WARN] Skipping commit", commit_hash, "-", repr(exc), file=sys.stderr)
        return []
def main():
    """Build the bug-fix dataset and write it as JSONL.

    Steps: verify ``REPO_PATH`` exists, collect commit hashes (only the first
    ``MAX_COMMITS_TEST`` when ``TEST_MODE`` is on), process them in a pool of
    ``NUM_WORKERS`` processes, and write each resulting entry as one JSON
    object per line. Entries are written as results arrive so the full dataset
    is never held in memory; a run interrupted midway therefore leaves a
    partial output file behind.
    """
    from itertools import islice

    if not os.path.exists(REPO_PATH):
        print("[ERROR] Repository not found at:", REPO_PATH)
        return

    # Resolve the real destination first so the directory creation and the
    # log line below both refer to the file that is actually written.
    output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

    print("[INFO] Building Linux kernel bug-fix dataset...")
    print("[INFO] Repository:", REPO_PATH)
    print("[INFO] Output file:", output_file)

    commits = Repository(REPO_PATH).traverse_commits()
    if TEST_MODE and MAX_COMMITS_TEST:
        # Stop traversing once the test quota is reached instead of walking
        # the entire history and discarding most of it.
        commits = islice(commits, MAX_COMMITS_TEST)
    all_hashes = [c.hash for c in commits]

    total_entries = 0
    sample = None
    with Pool(NUM_WORKERS) as pool, open(output_file, 'w', encoding='utf-8') as f:
        results = pool.imap_unordered(collect_entries_from_hash, all_hashes)
        for entries in tqdm(results, total=len(all_hashes)):
            for entry in entries:
                if sample is None:
                    sample = entry  # kept only for the preview printed below
                f.write(json.dumps(entry, ensure_ascii=False) + '\n')
            total_entries += len(entries)

    print("[DONE] Dataset creation completed!")
    print("[INFO] Total commits processed:", len(all_hashes))
    print("[INFO] Total dataset entries:", total_entries)
    print("[INFO] Saved to:", output_file)

    if sample is not None:
        print("[INFO] Sample dataset entry:")
        print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...")

    print("[INFO] Dataset structure:")
    print(" - Input: original code + instruction")
    print(" - Output: diff codes")
    print(" - Format: JSONL (one JSON object per line)")


if __name__ == "__main__":
    main()