
Rust: regenerate MaD files using DCA #19674


Open · wants to merge 13 commits into main
10 changes: 10 additions & 0 deletions cpp/bulk_generation_targets.yml
@@ -0,0 +1,10 @@
language: cpp
strategy: dca
destination: cpp/ql/lib/ext/generated
targets:
- name: openssl
  with-sinks: false
  with-sources: false
- name: sqlite
  with-sinks: false
  with-sources: false
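
[Editor's note] As a sketch of how a targets file like this could be consumed (not part of the PR; the loader function is hypothetical, though the with-* flags defaulting to true matches the script's should_generate_* helpers further down):

import yaml

def load_targets(path: str) -> dict:
    # Hypothetical loader for a bulk_generation_targets.yml file; only the
    # YAML keys shown above come from the PR, the rest is illustrative.
    with open(path, "r") as f:
        config = yaml.safe_load(f)
    for target in config["targets"]:
        # The with-* flags are optional and default to True, mirroring
        # should_generate_sinks/sources/summaries in bulk_generate_mad.py.
        target.setdefault("with-sinks", True)
        target.setdefault("with-sources", True)
        target.setdefault("with-summaries", True)
    return config

config = load_targets("cpp/bulk_generation_targets.yml")
print(config["strategy"], config["destination"])  # dca cpp/ql/lib/ext/generated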
9 changes: 0 additions & 9 deletions cpp/misc/bulk_generation_targets.json

This file was deleted.

197 changes: 121 additions & 76 deletions misc/scripts/models-as-data/bulk_generate_mad.py
File mode changed: 100644 → 100755
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
"""
Experimental script for bulk generation of MaD models based on a list of projects.

@@ -7,15 +8,27 @@
import os.path
import subprocess
import sys
from typing import NotRequired, TypedDict, List
from typing import Required, TypedDict, List, Callable, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
import json
import requests
import zipfile
import tarfile
from functools import cmp_to_key
import shutil

def missing_module(module_name: str) -> None:
    print(f"ERROR: {module_name} is not installed. Please install it with 'pip install {module_name}'.")
    sys.exit(1)

try:
    import yaml
except ImportError:
    missing_module("pyyaml")

try:
    import requests
except ImportError:
    missing_module("requests")

import generate_mad as mad

@@ -28,23 +41,14 @@


# A project to generate models for
class Project(TypedDict):
    """
    Type definition for projects (acquired via a GitHub repo) to model.

    Attributes:
        name: The name of the project
        git_repo: URL to the git repository
        git_tag: Optional Git tag to check out
    """

    name: str
    git_repo: NotRequired[str]
    git_tag: NotRequired[str]
    with_sinks: NotRequired[bool]
    with_sinks: NotRequired[bool]
    with_summaries: NotRequired[bool]

Project = TypedDict("Project", {
    "name": Required[str],
    "git-repo": str,
    "git-tag": str,
    "with-sinks": bool,
    "with-sources": bool,
    "with-summaries": bool,
}, total=False)

def should_generate_sinks(project: Project) -> bool:
    return project.get("with-sinks", True)
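
[Editor's note] The switch from the class-based TypedDict to the functional form above is forced by the key rename: hyphenated keys such as "git-repo" are not valid Python identifiers, so they can only be declared with the functional syntax. A minimal sketch of the resulting access pattern (the values here are made up):

from typing import Required, TypedDict

# Hyphenated keys can't be class attributes, hence the functional form;
# with total=False, only keys wrapped in Required are mandatory.
Demo = TypedDict("Demo", {"name": Required[str], "git-repo": str}, total=False)

d: Demo = {"name": "openssl"}       # valid: git-repo is optional
repo = d.get("git-repo", "<none>")  # optional keys are read via .get
print(repo)                         # <none>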
@@ -63,14 +67,14 @@ def clone_project(project: Project) -> str:
    Shallow clone a project into the build directory.

    Args:
        project: A dictionary containing project information with 'name', 'git_repo', and optional 'git_tag' keys.
        project: A dictionary containing project information with 'name', 'git-repo', and optional 'git-tag' keys.

    Returns:
        The path to the cloned project directory.
    """
    name = project["name"]
    repo_url = project["git_repo"]
    git_tag = project.get("git_tag")
    repo_url = project["git-repo"]
    git_tag = project.get("git-tag")

    # Determine target directory
    target_dir = os.path.join(build_dir, name)
@@ -103,6 +107,37 @@ def clone_project(project: Project) -> str:
    return target_dir


def run_in_parallel[T, U](
    func: Callable[[T], U],
    items: List[T],
    *,
    on_error=lambda item, exc: None,
    error_summary=lambda failures: None,
    max_workers=8,
) -> List[Optional[U]]:
    if not items:
        return []
    max_workers = min(max_workers, len(items))
    results = [None for _ in range(len(items))]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start the tasks and keep track of them
        futures = {
            executor.submit(func, item): index for index, item in enumerate(items)
        }
        # Process results as they complete
        for future in as_completed(futures):
            index = futures[future]
            try:
                results[index] = future.result()
            except Exception as e:
                on_error(items[index], e)
    failed = [item for item, result in zip(items, results) if result is None]
    if failed:
        error_summary(failed)
        sys.exit(1)
    return results
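
[Editor's note] A hypothetical usage of run_in_parallel (not from the PR; it assumes the function above is in scope). Results come back in input order; any exception triggers on_error per item, then error_summary once over all failed items, and the process exits with status 1:

# Illustrative only: squares numbers in parallel, failing on negatives.
def square_nonnegative(n: int) -> int:
    if n < 0:
        raise ValueError("negative input")
    return n * n

values = run_in_parallel(
    square_nonnegative,
    [1, 2, 3],
    on_error=lambda n, exc: print(f"ERROR: {n}: {exc}"),
    error_summary=lambda failed: print(f"ERROR: {len(failed)} items failed"),
    max_workers=2,
)
print(values)  # [1, 4, 9]

One caveat visible in the implementation: failure is detected by a result slot remaining None, so func should not legitimately return None.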


def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
"""
Clone all projects in parallel.
@@ -114,40 +149,19 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
        List of (project, project_dir) pairs in the same order as the input projects
    """
    start_time = time.time()
    max_workers = min(8, len(projects))  # Use at most 8 threads
    project_dirs_map = {}  # Map to store results by project name

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start cloning tasks and keep track of them
        future_to_project = {
            executor.submit(clone_project, project): project for project in projects
        }

        # Process results as they complete
        for future in as_completed(future_to_project):
            project = future_to_project[future]
            try:
                project_dir = future.result()
                project_dirs_map[project["name"]] = (project, project_dir)
            except Exception as e:
                print(f"ERROR: Failed to clone {project['name']}: {e}")

    if len(project_dirs_map) != len(projects):
        failed_projects = [
            project["name"]
            for project in projects
            if project["name"] not in project_dirs_map
        ]
        print(
            f"ERROR: Only {len(project_dirs_map)} out of {len(projects)} projects were cloned successfully. Failed projects: {', '.join(failed_projects)}"
        )
        sys.exit(1)

    project_dirs = [project_dirs_map[project["name"]] for project in projects]

    dirs = run_in_parallel(
Review comment (Copilot AI, Jun 5, 2025):
[nitpick] Exiting from within a utility function (via sys.exit in on_error handlers) can make the logic harder to test or reuse; consider returning errors and handling exit at the top level instead.

Suggested change:
dirs = run_in_parallel(
failed = run_in_parallel(

        clone_project,
        projects,
        on_error=lambda project, exc: print(
            f"ERROR: Failed to clone project {project['name']}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to clone {len(failures)} projects: {', '.join(p['name'] for p in failures)}"
        ),
    )
    clone_time = time.time() - start_time
    print(f"Cloning completed in {clone_time:.2f} seconds")
    return project_dirs
    return list(zip(projects, dirs))


def build_database(
@@ -159,7 +173,7 @@ def build_database(
    Args:
        language: The language for which to build the database (e.g., "rust").
        extractor_options: Additional options for the extractor.
        project: A dictionary containing project information with 'name' and 'git_repo' keys.
        project: A dictionary containing project information with 'name' and 'git-repo' keys.
        project_dir: Path to the CodeQL database.

    Returns:
@@ -307,7 +321,10 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:


def download_dca_databases(
    experiment_name: str, pat: str, projects: List[Project]
    language: str,
    experiment_name: str,
    pat: str,
    projects: List[Project],
) -> List[tuple[Project, str | None]]:
    """
    Download databases from a DCA experiment.
@@ -318,14 +335,14 @@
    Returns:
        List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
    """
    database_results = {}
    print("\n=== Finding projects ===")
    response = get_json_from_github(
        f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
        pat,
    )
    targets = response["targets"]
    project_map = {project["name"]: project for project in projects}
    artifact_map = {}
Review comment (Contributor):
There already is an artifact_name inside download_and_extract, which ends up shadowing this one. Could we rename one of them just to make this a bit clearer?

Reply (Contributor, Author):
As they hold exactly the same values, taken the same way from the analyzed_database dict, I don't think that is confusing, so I'd rather leave this.
    for data in targets.values():
        downloads = data["downloads"]
        analyzed_database = downloads["analyzed_database"]
@@ -336,6 +353,15 @@
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

if pretty_name in artifact_map:
print(
f"Skipping previous database {artifact_map[pretty_name]['artifact_name']} for {pretty_name}"
)

artifact_map[pretty_name] = analyzed_database

def download_and_decompress(analyzed_database: dict) -> str:
artifact_name = analyzed_database["artifact_name"]
repository = analyzed_database["repository"]
run_id = analyzed_database["run_id"]
print(f"=== Finding artifact: {artifact_name} ===")
@@ -351,27 +377,41 @@
        artifact_zip_location = download_artifact(
            archive_download_url, artifact_name, pat
        )
        print(f"=== Extracting artifact: {artifact_name} ===")
        print(f"=== Decompressing artifact: {artifact_name} ===")
        # The database is in a zip file, which contains a tar.gz file with the DB
        # First we open the zip file
        with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
            artifact_unzipped_location = os.path.join(build_dir, artifact_name)
            # clean up any remnants of previous runs
            shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
            # And then we extract it to build_dir/artifact_name
            zip_ref.extractall(artifact_unzipped_location)
            # And then we iterate over the contents of the extracted directory
            # and extract the tar.gz files inside it
            for entry in os.listdir(artifact_unzipped_location):
                artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
                with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                    # And we just untar it to the same directory as the zip file
                    tar_ref.extractall(artifact_unzipped_location)
                database_results[pretty_name] = os.path.join(
                    artifact_unzipped_location, remove_extension(entry)
                )
            # and extract the language tar.gz file inside it
Review comment (Contributor, on lines 389 to +390):
We're not iterating any more though, right? We're just unzipping the one correct .tar.gz?

Why did we iterate in the previous design?

Reply (Contributor, Author):
I think it was just a way to take the only contained file without specifying its name, but the name is easy to specify, which is what I've done here.

            artifact_tar_location = os.path.join(
                artifact_unzipped_location, f"{language}.tar.gz"
            )
            with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                # And we just untar it to the same directory as the zip file
                tar_ref.extractall(artifact_unzipped_location)
        ret = os.path.join(artifact_unzipped_location, language)
        print(f"Decompression complete: {ret}")
        return ret

    results = run_in_parallel(
        download_and_decompress,
        list(artifact_map.values()),
        on_error=lambda db, exc: print(
            f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
        ),
    )

    print(f"\n=== Extracted {len(database_results)} databases ===")
    print(f"\n=== Fetched {len(results)} databases ===")

    return [(project, database_results[project["name"]]) for project in projects]
    return [(project_map[n], r) for n, r in zip(artifact_map, results)]
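
[Editor's note] For clarity, here is the two-stage decompression from download_and_decompress as a standalone sketch. The paths and the function name are hypothetical (the real code derives them from the DCA artifact metadata), but the structure mirrors the diff: a DCA artifact is a zip archive containing a <language>.tar.gz, which in turn holds the CodeQL database.

import os
import shutil
import tarfile
import zipfile

def decompress_database(zip_path: str, out_dir: str, language: str) -> str:
    shutil.rmtree(out_dir, ignore_errors=True)  # clean remnants of earlier runs
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(out_dir)             # stage 1: unzip the artifact
    tar_path = os.path.join(out_dir, f"{language}.tar.gz")
    with tarfile.open(tar_path, "r:gz") as tar_ref:
        tar_ref.extractall(out_dir)             # stage 2: untar the database
    return os.path.join(out_dir, language)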


def get_mad_destination_for_project(config, name: str) -> str:
@@ -422,7 +462,9 @@ def main(config, args) -> None:
case "repo":
extractor_options = config.get("extractor_options", [])
database_results = build_databases_from_projects(
language, extractor_options, projects
language,
extractor_options,
projects,
)
case "dca":
experiment_name = args.dca
@@ -439,7 +481,10 @@
            with open(args.pat, "r") as f:
                pat = f.read().strip()
            database_results = download_dca_databases(
                experiment_name, pat, projects
                language,
                experiment_name,
                pat,
                projects,
            )

    # Generate models for all projects
@@ -492,9 +537,9 @@ def main(config, args) -> None:
        sys.exit(1)
    try:
        with open(args.config, "r") as f:
            config = json.load(f)
    except json.JSONDecodeError as e:
        print(f"ERROR: Failed to parse JSON file {args.config}: {e}")
            config = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f"ERROR: Failed to parse YAML file {args.config}: {e}")
        sys.exit(1)

    main(config, args)
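
[Editor's note] Since the config file moved from JSON to YAML, it may help to see what yaml.safe_load yields for a targets file like the Rust one below. This is an abbreviated, illustrative rendering; note that hyphenated YAML keys arrive as plain string keys, matching the functional Project TypedDict above.

# Abbreviated sketch of the parsed YAML (illustrative, first targets only):
config = {
    "strategy": "dca",
    "language": "rust",
    "destination": "rust/ql/lib/ext/generated",
    "targets": [
        {"name": "rust"},
        {"name": "libc"},
        # ... remaining targets, each optionally carrying with-sinks /
        # with-sources / with-summaries flags as plain string keys
    ],
}
print(config["targets"][0]["name"])  # rust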
24 changes: 24 additions & 0 deletions rust/bulk_generation_targets.yml
@@ -0,0 +1,24 @@
strategy: dca
language: rust
destination: rust/ql/lib/ext/generated
# targets must have a name specified, corresponding to the name in the DCA suite
# they can optionally specify any of
#   with-sinks: false
#   with-sources: false
#   with-summaries: false
# if a target has a dependency in this same list, it should be listed after that dependency
targets:
- name: rust
- name: libc
- name: log
- name: memchr
- name: once_cell
- name: rand
- name: smallvec
- name: serde
- name: tokio
- name: reqwest
- name: rocket
- name: actix-web
- name: hyper
- name: clap
Review comment (Contributor):
This is a nice simple list; once everything is merged and stable I'll add a bunch more targets to it.

Reply (Contributor, Author):
One thing to keep in mind is that at the moment this list needs to be topologically ordered with respect to dependencies (so later additions should depend on earlier ones and not the other way around). Possibly worth a comment here, now that this is YAML.

Reply (Contributor, Author):
Also, just so you know, you can tweak what gets generated with any of

with-sinks: false
with-sources: false
with-summaries: false

(all are true by default)
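
[Editor's note] The topological-ordering constraint mentioned above could be made checkable with a helper along these lines. This is entirely hypothetical: the deps map is not part of this PR and would have to come from elsewhere, e.g. crate metadata.

# Sketch of an ordering check: every target must appear after all of its
# dependencies in the targets list.
def check_order(targets: list[str], deps: dict[str, set[str]]) -> None:
    seen: set[str] = set()
    for name in targets:
        missing = deps.get(name, set()) - seen
        if missing:
            raise ValueError(f"{name} must come after: {', '.join(sorted(missing))}")
        seen.add(name)

check_order(["libc", "rand"], {"rand": {"libc"}})  # OK: libc precedes rand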
