Rust: regenerate MaD files using DCA #19674
Changes from all commits: 31d1604, 900a3b0, d5c16d6, 31954fa, fbd5058, 4f47ee2, ee7eb86, 530b990, f4bbef9, ec77eb3, 6162cf5, e1eb1f6, d6d13b9
@@ -0,0 +1,10 @@
language: cpp
strategy: dca
destination: cpp/ql/lib/ext/generated
targets:
- name: openssl
  with-sinks: false
  with-sources: false
- name: sqlite
  with-sinks: false
  with-sources: false
This file was deleted.
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
"""
Experimental script for bulk generation of MaD models based on a list of projects.
@@ -7,15 +8,27 @@
import os.path
import subprocess
import sys
from typing import NotRequired, TypedDict, List
from typing import Required, TypedDict, List, Callable, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
import json
import requests
import zipfile
import tarfile
from functools import cmp_to_key
import shutil


def missing_module(module_name: str) -> None:
    print(f"ERROR: {module_name} is not installed. Please install it with 'pip install {module_name}'.")
    sys.exit(1)


try:
    import yaml
except ImportError:
    missing_module("pyyaml")


try:
    import requests
except ImportError:
    missing_module("requests")


import generate_mad as mad
@@ -28,23 +41,14 @@


# A project to generate models for
class Project(TypedDict):
    """
    Type definition for projects (acquired via a GitHub repo) to model.

    Attributes:
        name: The name of the project
        git_repo: URL to the git repository
        git_tag: Optional Git tag to check out
    """

    name: str
    git_repo: NotRequired[str]
    git_tag: NotRequired[str]
    with_sinks: NotRequired[bool]
    with_sinks: NotRequired[bool]
    with_summaries: NotRequired[bool]

Project = TypedDict("Project", {
    "name": Required[str],
    "git-repo": str,
    "git-tag": str,
    "with-sinks": bool,
    "with-sources": bool,
    "with-summaries": bool,
}, total=False)


def should_generate_sinks(project: Project) -> bool:
    return project.get("with-sinks", True)
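Note on the TypedDict change above: hyphenated keys such as "git-repo" and "with-sinks" are not valid Python identifiers, so they cannot be declared with the class-based TypedDict syntax; the functional TypedDict("Project", {...}) form is what makes them possible. A minimal illustrative sketch (not part of this diff; the Example name and the "<none>" default are made up) of reading such optional keys with .get(), in the same way should_generate_sinks does:

    from typing import Required, TypedDict

    # Hyphenated keys require the functional TypedDict form.
    Example = TypedDict("Example", {"name": Required[str], "git-repo": str}, total=False)

    def repo_or_default(p: Example) -> str:
        # Optional keys are read with .get(), mirroring should_generate_sinks above.
        return p.get("git-repo", "<none>")

    print(repo_or_default({"name": "libc"}))  # prints "<none>"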
@@ -63,14 +67,14 @@ def clone_project(project: Project) -> str:
    Shallow clone a project into the build directory.

    Args:
        project: A dictionary containing project information with 'name', 'git_repo', and optional 'git_tag' keys.
        project: A dictionary containing project information with 'name', 'git-repo', and optional 'git-tag' keys.

    Returns:
        The path to the cloned project directory.
    """
    name = project["name"]
    repo_url = project["git_repo"]
    git_tag = project.get("git_tag")
    repo_url = project["git-repo"]
    git_tag = project.get("git-tag")

    # Determine target directory
    target_dir = os.path.join(build_dir, name)
@@ -103,6 +107,37 @@ def clone_project(project: Project) -> str:
    return target_dir


def run_in_parallel[T, U](
    func: Callable[[T], U],
    items: List[T],
    *,
    on_error=lambda item, exc: None,
    error_summary=lambda failures: None,
    max_workers=8,
) -> List[Optional[U]]:
    if not items:
        return []
    max_workers = min(max_workers, len(items))
    results = [None for _ in range(len(items))]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start cloning tasks and keep track of them
        futures = {
            executor.submit(func, item): index for index, item in enumerate(items)
        }
        # Process results as they complete
        for future in as_completed(futures):
            index = futures[future]
            try:
                results[index] = future.result()
            except Exception as e:
                on_error(items[index], e)
    failed = [item for item, result in zip(items, results) if result is None]
    if failed:
        error_summary(failed)
        sys.exit(1)
    return results


def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
    """
    Clone all projects in parallel.
@@ -114,40 +149,19 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
        List of (project, project_dir) pairs in the same order as the input projects
    """
    start_time = time.time()
    max_workers = min(8, len(projects))  # Use at most 8 threads
    project_dirs_map = {}  # Map to store results by project name

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start cloning tasks and keep track of them
        future_to_project = {
            executor.submit(clone_project, project): project for project in projects
        }

        # Process results as they complete
        for future in as_completed(future_to_project):
            project = future_to_project[future]
            try:
                project_dir = future.result()
                project_dirs_map[project["name"]] = (project, project_dir)
            except Exception as e:
                print(f"ERROR: Failed to clone {project['name']}: {e}")

    if len(project_dirs_map) != len(projects):
        failed_projects = [
            project["name"]
            for project in projects
            if project["name"] not in project_dirs_map
        ]
        print(
            f"ERROR: Only {len(project_dirs_map)} out of {len(projects)} projects were cloned successfully. Failed projects: {', '.join(failed_projects)}"
        )
        sys.exit(1)

    project_dirs = [project_dirs_map[project["name"]] for project in projects]

    dirs = run_in_parallel(
        clone_project,
        projects,
        on_error=lambda project, exc: print(
            f"ERROR: Failed to clone project {project['name']}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to clone {len(failures)} projects: {', '.join(p['name'] for p in failures)}"
        ),
    )
    clone_time = time.time() - start_time
    print(f"Cloning completed in {clone_time:.2f} seconds")
    return project_dirs
    return list(zip(projects, dirs))


def build_database(
@@ -159,7 +173,7 @@ def build_database(
    Args:
        language: The language for which to build the database (e.g., "rust").
        extractor_options: Additional options for the extractor.
        project: A dictionary containing project information with 'name' and 'git_repo' keys.
        project: A dictionary containing project information with 'name' and 'git-repo' keys.
        project_dir: Path to the CodeQL database.

    Returns:
@@ -307,7 +321,10 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:


def download_dca_databases(
    experiment_name: str, pat: str, projects: List[Project]
    language: str,
    experiment_name: str,
    pat: str,
    projects: List[Project],
) -> List[tuple[Project, str | None]]:
    """
    Download databases from a DCA experiment.
@@ -318,14 +335,14 @@
    Returns:
        List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
    """
    database_results = {}
    print("\n=== Finding projects ===")
    response = get_json_from_github(
        f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
        pat,
    )
    targets = response["targets"]
    project_map = {project["name"]: project for project in projects}
    artifact_map = {}
Review comment: There already is an …

Reply: As they hold exactly the same values, taken the same way from the analyzed_database dict, I don't think that is confusing, so I'd rather leave this.
    for data in targets.values():
        downloads = data["downloads"]
        analyzed_database = downloads["analyzed_database"]
@@ -336,6 +353,15 @@
            print(f"Skipping {pretty_name} as it is not in the list of projects")
            continue

        if pretty_name in artifact_map:
            print(
                f"Skipping previous database {artifact_map[pretty_name]['artifact_name']} for {pretty_name}"
            )

        artifact_map[pretty_name] = analyzed_database

    def download_and_decompress(analyzed_database: dict) -> str:
        artifact_name = analyzed_database["artifact_name"]
        repository = analyzed_database["repository"]
        run_id = analyzed_database["run_id"]
        print(f"=== Finding artifact: {artifact_name} ===")
@@ -351,27 +377,41 @@
        artifact_zip_location = download_artifact(
            archive_download_url, artifact_name, pat
        )
        print(f"=== Extracting artifact: {artifact_name} ===")
        print(f"=== Decompressing artifact: {artifact_name} ===")
        # The database is in a zip file, which contains a tar.gz file with the DB
        # First we open the zip file
        with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
            artifact_unzipped_location = os.path.join(build_dir, artifact_name)
            # clean up any remnants of previous runs
            shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
            # And then we extract it to build_dir/artifact_name
            zip_ref.extractall(artifact_unzipped_location)
        # And then we iterate over the contents of the extracted directory
        # and extract the tar.gz files inside it
        for entry in os.listdir(artifact_unzipped_location):
            artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
            with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                # And we just untar it to the same directory as the zip file
                tar_ref.extractall(artifact_unzipped_location)
            database_results[pretty_name] = os.path.join(
                artifact_unzipped_location, remove_extension(entry)
            )
        # and extract the language tar.gz file inside it
Comment on lines 389 to +390:

Review comment: We're not iterating any more though, right? We're just unzipping the one correct … Why did we iterate in the previous design?

Reply: I think it was just a kind of way to take the only containing file without specifying its name, but the name is easy to specify, which is what I've done here.
        artifact_tar_location = os.path.join(
            artifact_unzipped_location, f"{language}.tar.gz"
        )
        with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
            # And we just untar it to the same directory as the zip file
            tar_ref.extractall(artifact_unzipped_location)
        ret = os.path.join(artifact_unzipped_location, language)
        print(f"Decompression complete: {ret}")
        return ret

    results = run_in_parallel(
        download_and_decompress,
        list(artifact_map.values()),
        on_error=lambda db, exc: print(
            f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
        ),
    )

    print(f"\n=== Extracted {len(database_results)} databases ===")
    print(f"\n=== Fetched {len(results)} databases ===")

    return [(project, database_results[project["name"]]) for project in projects]
    return [(project_map[n], r) for n, r in zip(artifact_map, results)]


def get_mad_destination_for_project(config, name: str) -> str:
@@ -422,7 +462,9 @@ def main(config, args) -> None:
        case "repo":
            extractor_options = config.get("extractor_options", [])
            database_results = build_databases_from_projects(
                language, extractor_options, projects
                language,
                extractor_options,
                projects,
            )
        case "dca":
            experiment_name = args.dca
@@ -439,7 +481,10 @@ def main(config, args) -> None:
            with open(args.pat, "r") as f:
                pat = f.read().strip()
            database_results = download_dca_databases(
                experiment_name, pat, projects
                language,
                experiment_name,
                pat,
                projects,
            )

    # Generate models for all projects
@@ -492,9 +537,9 @@ def main(config, args) -> None:
        sys.exit(1)
    try:
        with open(args.config, "r") as f:
            config = json.load(f)
    except json.JSONDecodeError as e:
        print(f"ERROR: Failed to parse JSON file {args.config}: {e}")
            config = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f"ERROR: Failed to parse YAML file {args.config}: {e}")
        sys.exit(1)

    main(config, args)
@@ -0,0 +1,24 @@
strategy: dca
language: rust
destination: rust/ql/lib/ext/generated
# targets must have name specified and corresponding to the name in the DCA suite
# they can optionally specify any of
#   with-sinks: false
#   with-sources: false
#   with-summaries: false
# if a target has a dependency in this same list, it should be listed after that dependency
targets:
- name: rust
- name: libc
- name: log
- name: memchr
- name: once_cell
- name: rand
- name: smallvec
- name: serde
- name: tokio
- name: reqwest
- name: rocket
- name: actix-web
- name: hyper
- name: clap
Review comment: This is a nice simple list; once everything is merged and stable I'll add a bunch more targets to it.

Review comment: One thing to keep in mind is that at the moment this list needs to be topologically ordered with respect to dependencies (so later additions should depend on earlier ones and not the other way around). Possibly worth a comment here, now that this is YAML.

Review comment: Also, just so you know, you can tweak what gets generated with any of

    with-sinks: false
    with-sources: false
    with-summaries: false

(all are true by default)
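To make those per-target flags concrete, here is a small illustrative sketch (not part of the PR) that loads a config shaped like the one above and prints the effective flags for each target, using the same .get(..., True) defaulting as should_generate_sinks in the script. The config path in the commented call is hypothetical.

    import yaml

    def effective_flags(path: str) -> None:
        # Load a bulk-generation config and show per-target flags;
        # with-sinks / with-sources / with-summaries default to True when omitted.
        with open(path, "r") as f:
            config = yaml.safe_load(f)
        for target in config.get("targets", []):
            flags = {
                key: target.get(key, True)
                for key in ("with-sinks", "with-sources", "with-summaries")
            }
            print(target["name"], flags)

    # effective_flags("rust/misc/bulk_generation_targets.yml")  # hypothetical path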
Review comment (Copilot): [nitpick] Exiting from within a utility function (via sys.exit in on_error handlers) can make the logic harder to test or reuse; consider returning errors and handling exit at the top level instead.
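For illustration, a minimal sketch of the alternative the nitpick suggests: the helper collects failures and returns them, and the caller decides whether to abort. This is not what the PR implements; the name run_in_parallel_collect and the return shape are assumptions.

    from concurrent.futures import ThreadPoolExecutor, as_completed
    from typing import Callable, List, Optional, Tuple

    def run_in_parallel_collect[T, U](
        func: Callable[[T], U],
        items: List[T],
        *,
        max_workers: int = 8,
    ) -> Tuple[List[Optional[U]], List[Tuple[T, Exception]]]:
        # Run func over items in a thread pool; never exits the process itself.
        results: List[Optional[U]] = [None] * len(items)
        failures: List[Tuple[T, Exception]] = []
        if not items:
            return results, failures
        with ThreadPoolExecutor(max_workers=min(max_workers, len(items))) as executor:
            futures = {executor.submit(func, item): i for i, item in enumerate(items)}
            for future in as_completed(futures):
                i = futures[future]
                try:
                    results[i] = future.result()
                except Exception as exc:
                    failures.append((items[i], exc))
        return results, failures

    # The caller reports and decides whether to abort, e.g.:
    # results, failures = run_in_parallel_collect(clone_project, projects)
    # if failures:
    #     print(f"ERROR: {len(failures)} items failed")
    #     raise SystemExit(1)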