
Rust: regenerate MaD files using DCA #19674


Open · wants to merge 13 commits into main
10 changes: 10 additions & 0 deletions cpp/bulk_generation_targets.yml
@@ -0,0 +1,10 @@
language: cpp
strategy: dca
destination: cpp/ql/lib/ext/generated
targets:
- name: openssl
  with-sinks: false
  with-sources: false
- name: sqlite
  with-sinks: false
  with-sources: false
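
[Editor's note] As a sketch of how a targets file like this could be consumed (not part of the PR; the loader function is hypothetical, though the with-* flags defaulting to true matches the script's should_generate_* helpers further down):

import yaml

def load_targets(path: str) -> dict:
    # Hypothetical loader for a bulk_generation_targets.yml file; only the
    # YAML keys shown above come from the PR, the rest is illustrative.
    with open(path, "r") as f:
        config = yaml.safe_load(f)
    for target in config["targets"]:
        # The with-* flags are optional and default to True, mirroring
        # should_generate_sinks/sources/summaries in bulk_generate_mad.py.
        target.setdefault("with-sinks", True)
        target.setdefault("with-sources", True)
        target.setdefault("with-summaries", True)
    return config

config = load_targets("cpp/bulk_generation_targets.yml")
print(config["strategy"], config["destination"])  # dca cpp/ql/lib/ext/generated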
9 changes: 0 additions & 9 deletions cpp/misc/bulk_generation_targets.json

This file was deleted.

197 changes: 121 additions & 76 deletions misc/scripts/models-as-data/bulk_generate_mad.py
File mode changed: 100644 → 100755
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
"""
Experimental script for bulk generation of MaD models based on a list of projects.

@@ -7,15 +8,27 @@
import os.path
import subprocess
import sys
from typing import NotRequired, TypedDict, List
from typing import Required, TypedDict, List, Callable, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
import json
import requests
import zipfile
import tarfile
from functools import cmp_to_key
import shutil

def missing_module(module_name: str) -> None:
    print(f"ERROR: {module_name} is not installed. Please install it with 'pip install {module_name}'.")
    sys.exit(1)

try:
    import yaml
except ImportError:
    missing_module("pyyaml")

try:
    import requests
except ImportError:
    missing_module("requests")

import generate_mad as mad

@@ -28,23 +41,14 @@


# A project to generate models for
class Project(TypedDict):
    """
    Type definition for projects (acquired via a GitHub repo) to model.

    Attributes:
        name: The name of the project
        git_repo: URL to the git repository
        git_tag: Optional Git tag to check out
    """

    name: str
    git_repo: NotRequired[str]
    git_tag: NotRequired[str]
    with_sinks: NotRequired[bool]
    with_sinks: NotRequired[bool]
    with_summaries: NotRequired[bool]

Project = TypedDict("Project", {
    "name": Required[str],
    "git-repo": str,
    "git-tag": str,
    "with-sinks": bool,
    "with-sources": bool,
    "with-summaries": bool,
}, total=False)

def should_generate_sinks(project: Project) -> bool:
    return project.get("with-sinks", True)
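
[Editor's note] The switch from the class-based TypedDict to the functional form above is forced by the key rename: hyphenated keys such as "git-repo" are not valid Python identifiers, so they can only be declared with the functional syntax. A minimal sketch of the resulting access pattern (the values here are made up):

from typing import Required, TypedDict

# Hyphenated keys can't be class attributes, hence the functional form;
# with total=False, only keys wrapped in Required are mandatory.
Demo = TypedDict("Demo", {"name": Required[str], "git-repo": str}, total=False)

d: Demo = {"name": "openssl"}       # valid: git-repo is optional
repo = d.get("git-repo", "<none>")  # optional keys are read via .get
print(repo)                         # <none>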
@@ -63,14 +67,14 @@ def clone_project(project: Project) -> str:
    Shallow clone a project into the build directory.

    Args:
        project: A dictionary containing project information with 'name', 'git_repo', and optional 'git_tag' keys.
        project: A dictionary containing project information with 'name', 'git-repo', and optional 'git-tag' keys.

    Returns:
        The path to the cloned project directory.
    """
    name = project["name"]
    repo_url = project["git_repo"]
    git_tag = project.get("git_tag")
    repo_url = project["git-repo"]
    git_tag = project.get("git-tag")

    # Determine target directory
    target_dir = os.path.join(build_dir, name)
@@ -103,6 +107,37 @@ def clone_project(project: Project) -> str:
    return target_dir


def run_in_parallel[T, U](
    func: Callable[[T], U],
    items: List[T],
    *,
    on_error=lambda item, exc: None,
    error_summary=lambda failures: None,
    max_workers=8,
) -> List[Optional[U]]:
    if not items:
        return []
    max_workers = min(max_workers, len(items))
    results = [None for _ in range(len(items))]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start the tasks and keep track of them
        futures = {
            executor.submit(func, item): index for index, item in enumerate(items)
        }
        # Process results as they complete
        for future in as_completed(futures):
            index = futures[future]
            try:
                results[index] = future.result()
            except Exception as e:
                on_error(items[index], e)
    failed = [item for item, result in zip(items, results) if result is None]
    if failed:
        error_summary(failed)
        sys.exit(1)
    return results
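
[Editor's note] A hypothetical usage of run_in_parallel (not from the PR; it assumes the function above is in scope). Results come back in input order; any exception triggers on_error per item, then error_summary once over all failed items, and the process exits with status 1:

# Illustrative only: squares numbers in parallel, failing on negatives.
def square_nonnegative(n: int) -> int:
    if n < 0:
        raise ValueError("negative input")
    return n * n

values = run_in_parallel(
    square_nonnegative,
    [1, 2, 3],
    on_error=lambda n, exc: print(f"ERROR: {n}: {exc}"),
    error_summary=lambda failed: print(f"ERROR: {len(failed)} items failed"),
    max_workers=2,
)
print(values)  # [1, 4, 9]

One caveat visible in the implementation: failure is detected by a result slot remaining None, so func should not legitimately return None.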


def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
"""
Clone all projects in parallel.
@@ -114,40 +149,19 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
        List of (project, project_dir) pairs in the same order as the input projects
    """
    start_time = time.time()
    max_workers = min(8, len(projects))  # Use at most 8 threads
    project_dirs_map = {}  # Map to store results by project name

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start cloning tasks and keep track of them
        future_to_project = {
            executor.submit(clone_project, project): project for project in projects
        }

        # Process results as they complete
        for future in as_completed(future_to_project):
            project = future_to_project[future]
            try:
                project_dir = future.result()
                project_dirs_map[project["name"]] = (project, project_dir)
            except Exception as e:
                print(f"ERROR: Failed to clone {project['name']}: {e}")

    if len(project_dirs_map) != len(projects):
        failed_projects = [
            project["name"]
            for project in projects
            if project["name"] not in project_dirs_map
        ]
        print(
            f"ERROR: Only {len(project_dirs_map)} out of {len(projects)} projects were cloned successfully. Failed projects: {', '.join(failed_projects)}"
        )
        sys.exit(1)

    project_dirs = [project_dirs_map[project["name"]] for project in projects]

    dirs = run_in_parallel(
Review comment (Copilot AI, Jun 5, 2025):
[nitpick] Exiting from within a utility function (via sys.exit in on_error handlers) can make the logic harder to test or reuse; consider returning errors and handling exit at the top level instead.

Suggested change:
dirs = run_in_parallel(
failed = run_in_parallel(

        clone_project,
        projects,
        on_error=lambda project, exc: print(
            f"ERROR: Failed to clone project {project['name']}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to clone {len(failures)} projects: {', '.join(p['name'] for p in failures)}"
        ),
    )
    clone_time = time.time() - start_time
    print(f"Cloning completed in {clone_time:.2f} seconds")
    return project_dirs
    return list(zip(projects, dirs))


def build_database(
@@ -159,7 +173,7 @@ def build_database(
    Args:
        language: The language for which to build the database (e.g., "rust").
        extractor_options: Additional options for the extractor.
        project: A dictionary containing project information with 'name' and 'git_repo' keys.
        project: A dictionary containing project information with 'name' and 'git-repo' keys.
        project_dir: Path to the CodeQL database.

    Returns:
@@ -307,7 +321,10 @@ def pretty_name_from_artifact_name(artifact_name: str) -> str:


def download_dca_databases(
    experiment_name: str, pat: str, projects: List[Project]
    language: str,
    experiment_name: str,
    pat: str,
    projects: List[Project],
) -> List[tuple[Project, str | None]]:
    """
    Download databases from a DCA experiment.
@@ -318,14 +335,14 @@
    Returns:
        List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
    """
    database_results = {}
    print("\n=== Finding projects ===")
    response = get_json_from_github(
        f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
        pat,
    )
    targets = response["targets"]
    project_map = {project["name"]: project for project in projects}
    artifact_map = {}
Review comment (Contributor):
There already is an artifact_name inside download_and_extract, which ends up shadowing this one. Could we rename one of them just to make this a bit clearer?

Reply (Contributor, Author):
As they hold exactly the same values, taken the same way from the analyzed_database dict, I don't think that is confusing, so I'd rather leave this.
    for data in targets.values():
        downloads = data["downloads"]
        analyzed_database = downloads["analyzed_database"]
@@ -336,6 +353,15 @@
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

if pretty_name in artifact_map:
print(
f"Skipping previous database {artifact_map[pretty_name]['artifact_name']} for {pretty_name}"
)

artifact_map[pretty_name] = analyzed_database

def download_and_decompress(analyzed_database: dict) -> str:
artifact_name = analyzed_database["artifact_name"]
repository = analyzed_database["repository"]
run_id = analyzed_database["run_id"]
print(f"=== Finding artifact: {artifact_name} ===")
@@ -351,27 +377,41 @@
        artifact_zip_location = download_artifact(
            archive_download_url, artifact_name, pat
        )
        print(f"=== Extracting artifact: {artifact_name} ===")
        print(f"=== Decompressing artifact: {artifact_name} ===")
        # The database is in a zip file, which contains a tar.gz file with the DB
        # First we open the zip file
        with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
            artifact_unzipped_location = os.path.join(build_dir, artifact_name)
            # clean up any remnants of previous runs
            shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
            # And then we extract it to build_dir/artifact_name
            zip_ref.extractall(artifact_unzipped_location)
            # And then we iterate over the contents of the extracted directory
            # and extract the tar.gz files inside it
            for entry in os.listdir(artifact_unzipped_location):
                artifact_tar_location = os.path.join(artifact_unzipped_location, entry)
                with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                    # And we just untar it to the same directory as the zip file
                    tar_ref.extractall(artifact_unzipped_location)
                database_results[pretty_name] = os.path.join(
                    artifact_unzipped_location, remove_extension(entry)
                )
            # and extract the language tar.gz file inside it
Review comment (Contributor, on lines 389 to +390):
We're not iterating any more though, right? We're just unzipping the one correct .tar.gz?

Why did we iterate in the previous design?

Reply (Contributor, Author):
I think it was just a way to take the only contained file without specifying its name, but the name is easy to specify, which is what I've done here.

            artifact_tar_location = os.path.join(
                artifact_unzipped_location, f"{language}.tar.gz"
            )
            with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
                # And we just untar it to the same directory as the zip file
                tar_ref.extractall(artifact_unzipped_location)
        ret = os.path.join(artifact_unzipped_location, language)
        print(f"Decompression complete: {ret}")
        return ret

    results = run_in_parallel(
        download_and_decompress,
        list(artifact_map.values()),
        on_error=lambda db, exc: print(
            f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
        ),
        error_summary=lambda failures: print(
            f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
        ),
    )

    print(f"\n=== Extracted {len(database_results)} databases ===")
    print(f"\n=== Fetched {len(results)} databases ===")

    return [(project, database_results[project["name"]]) for project in projects]
    return [(project_map[n], r) for n, r in zip(artifact_map, results)]
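
[Editor's note] For clarity, here is the two-stage decompression from download_and_decompress as a standalone sketch. The paths and the function name are hypothetical (the real code derives them from the DCA artifact metadata), but the structure mirrors the diff: a DCA artifact is a zip archive containing a <language>.tar.gz, which in turn holds the CodeQL database.

import os
import shutil
import tarfile
import zipfile

def decompress_database(zip_path: str, out_dir: str, language: str) -> str:
    shutil.rmtree(out_dir, ignore_errors=True)  # clean remnants of earlier runs
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(out_dir)             # stage 1: unzip the artifact
    tar_path = os.path.join(out_dir, f"{language}.tar.gz")
    with tarfile.open(tar_path, "r:gz") as tar_ref:
        tar_ref.extractall(out_dir)             # stage 2: untar the database
    return os.path.join(out_dir, language)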


def get_mad_destination_for_project(config, name: str) -> str:
@@ -422,7 +462,9 @@ def main(config, args) -> None:
case "repo":
extractor_options = config.get("extractor_options", [])
database_results = build_databases_from_projects(
language, extractor_options, projects
language,
extractor_options,
projects,
)
case "dca":
experiment_name = args.dca
@@ -439,7 +481,10 @@
            with open(args.pat, "r") as f:
                pat = f.read().strip()
            database_results = download_dca_databases(
                experiment_name, pat, projects
                language,
                experiment_name,
                pat,
                projects,
            )

    # Generate models for all projects
@@ -492,9 +537,9 @@ def main(config, args) -> None:
        sys.exit(1)
    try:
        with open(args.config, "r") as f:
            config = json.load(f)
    except json.JSONDecodeError as e:
        print(f"ERROR: Failed to parse JSON file {args.config}: {e}")
            config = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(f"ERROR: Failed to parse YAML file {args.config}: {e}")
        sys.exit(1)

    main(config, args)
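
[Editor's note] Since the config file moved from JSON to YAML, it may help to see what yaml.safe_load yields for a targets file like the Rust one below. This is an abbreviated, illustrative rendering; note that hyphenated YAML keys arrive as plain string keys, matching the functional Project TypedDict above.

# Abbreviated sketch of the parsed YAML (illustrative, first targets only):
config = {
    "strategy": "dca",
    "language": "rust",
    "destination": "rust/ql/lib/ext/generated",
    "targets": [
        {"name": "rust"},
        {"name": "libc"},
        # ... remaining targets, each optionally carrying with-sinks /
        # with-sources / with-summaries flags as plain string keys
    ],
}
print(config["targets"][0]["name"])  # rust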
24 changes: 24 additions & 0 deletions rust/bulk_generation_targets.yml
@@ -0,0 +1,24 @@
strategy: dca
language: rust
destination: rust/ql/lib/ext/generated
# targets must have a name specified, corresponding to the name in the DCA suite
# they can optionally specify any of
#   with-sinks: false
#   with-sources: false
#   with-summaries: false
# if a target has a dependency in this same list, it should be listed after that dependency
targets:
- name: rust
- name: libc
- name: log
- name: memchr
- name: once_cell
- name: rand
- name: smallvec
- name: serde
- name: tokio
- name: reqwest
- name: rocket
- name: actix-web
- name: hyper
- name: clap
Review comment (Contributor):
This is a nice simple list; once everything is merged and stable I'll add a bunch more targets to it.

Reply (Contributor, Author):
One thing to keep in mind is that at the moment this list needs to be topologically ordered with respect to dependencies (so later additions should depend on earlier ones and not the other way around). Possibly worth a comment here, now that this is YAML.

Reply (Contributor, Author):
Also, just so you know, you can tweak what gets generated with any of

with-sinks: false
with-sources: false
with-summaries: false

(all are true by default)
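
[Editor's note] The topological-ordering constraint mentioned above could be made checkable with a helper along these lines. This is entirely hypothetical: the deps map is not part of this PR and would have to come from elsewhere, e.g. crate metadata.

# Sketch of an ordering check: every target must appear after all of its
# dependencies in the targets list.
def check_order(targets: list[str], deps: dict[str, set[str]]) -> None:
    seen: set[str] = set()
    for name in targets:
        missing = deps.get(name, set()) - seen
        if missing:
            raise ValueError(f"{name} must come after: {', '.join(sorted(missing))}")
        seen.add(name)

check_order(["libc", "rand"], {"rand": {"libc"}})  # OK: libc precedes rand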
