[scan-build-py] Update scan-build-py to allow outputting as SARIF

Daniel Hwang · zeroomega · commit d72859ffa237 · 2021-02-07T18:25:50.000-08:00
Clang static analysis reports can be generated in html, plist, or sarif format. This updates scan-build-py to be able to specify SARIF as the desired output format, as previously it only supported the plist and html formats. Differential Revision: https://reviews.llvm.org/D94251
diff --git a/clang/tools/scan-build-py/libscanbuild/analyze.py b/clang/tools/scan-build-py/libscanbuild/analyze.py
@@ -52,7 +52,8 @@ def scan_build():
 
     args = parse_args_for_scan_build()
     # will re-assign the report directory as new output
-    with report_directory(args.output, args.keep_empty) as args.output:
+    with report_directory(
+            args.output, args.keep_empty, args.output_format) as args.output:
         # Run against a build command. there are cases, when analyzer run
         # is not required. But we need to set up everything for the
         # wrappers, because 'configure' needs to capture the CC/CXX values
@@ -79,7 +80,7 @@ def analyze_build():
 
     args = parse_args_for_analyze_build()
     # will re-assign the report directory as new output
-    with report_directory(args.output, args.keep_empty) as args.output:
+    with report_directory(args.output, args.keep_empty, args.output_format) as args.output:
         # Run the analyzer against a compilation db.
         govern_analyzer_runs(args)
         # Cover report generation and bug counting.
@@ -336,7 +337,7 @@ def analyze_compiler_wrapper_impl(result, execution):
 
 
 @contextlib.contextmanager
-def report_directory(hint, keep):
+def report_directory(hint, keep, output_format):
     """ Responsible for the report directory.
 
     hint -- could specify the parent directory of the output directory.
@@ -355,7 +356,11 @@ def report_directory(hint, keep):
         yield name
     finally:
         if os.listdir(name):
-            msg = "Run 'scan-view %s' to examine bug reports."
+            if output_format != 'sarif':
+                # 'scan-view' currently does not support sarif format.
+                msg = "Run 'scan-view %s' to examine bug reports."
+            else:
+                msg = "View result at %s/results-merged.sarif."
             keep = True
         else:
             if keep:
@@ -433,7 +438,7 @@ def wrapper(*args, **kwargs):
           'direct_args',  # arguments from command line
           'force_debug',  # kill non debug macros
           'output_dir',  # where generated report files shall go
-          'output_format',  # it's 'plist', 'html', both or plist-multi-file
+          'output_format',  # it's 'plist', 'html', 'plist-html', 'plist-multi-file', or 'sarif'
           'output_failures',  # generate crash reports or not
           'ctu'])  # ctu control options
 def run(opts):
@@ -537,6 +542,12 @@ def target():
                                               dir=opts['output_dir'])
             os.close(handle)
             return name
+        elif opts['output_format'] == 'sarif':
+            (handle, name) = tempfile.mkstemp(prefix='result-',
+                                              suffix='.sarif',
+                                              dir=opts['output_dir'])
+            os.close(handle)
+            return name
         return opts['output_dir']
 
     try:
diff --git a/clang/tools/scan-build-py/libscanbuild/arguments.py b/clang/tools/scan-build-py/libscanbuild/arguments.py
@@ -244,6 +244,14 @@ def create_analyze_parser(from_build_command):
         action='store_const',
         help="""Cause the results as a set of .plist files with extra
         information on related files.""")
+    format_group.add_argument(
+        '--sarif',
+        '-sarif',
+        dest='output_format',
+        const='sarif',
+        default='html',
+        action='store_const',
+        help="""Cause the results as a result.sarif file.""")
 
     advanced = parser.add_argument_group('advanced options')
     advanced.add_argument(
diff --git a/clang/tools/scan-build-py/libscanbuild/report.py b/clang/tools/scan-build-py/libscanbuild/report.py
@@ -27,6 +27,7 @@ def document(args):
     """ Generates cover report and returns the number of bugs/crashes. """
 
     html_reports_available = args.output_format in {'html', 'plist-html'}
+    sarif_reports_available = args.output_format in {'sarif'}
 
     logging.debug('count crashes and bugs')
     crash_count = sum(1 for _ in read_crashes(args.output))
@@ -57,6 +58,11 @@ def document(args):
         finally:
             for fragment in fragments:
                 os.remove(fragment)
+
+    if sarif_reports_available:
+        logging.debug('merging sarif files')
+        merge_sarif_files(args.output)
+
     return result
 
 
@@ -277,6 +283,98 @@ def empty(file_name):
             if not duplicate(bug):
                 yield bug
 
+def merge_sarif_files(output_dir, sort_files=False):
+    """ Reads and merges all .sarif files in the given output directory.
+
+    Each sarif file in the output directory is understood as a single run
+    and thus appear separate in the top level runs array. This requires
+    modifying the run index of any embedded links in messages.
+
+    output_dir -- directory holding the '*.sarif' files; the merged log is
+                  written into it as 'results-merged.sarif'.
+    sort_files -- merge the files in sorted name order. Exposed for testing
+                  since the order returned by glob is not guaranteed.
+
+    A 'results-merged.sarif' left by an earlier call is not read back in,
+    so merging the same directory twice does not duplicate its runs. If no
+    file has a top level 'runs' property an empty JSON object is written.
+    """
+
+    merged_name = 'results-merged.sarif'
+    # we only merge runs, so only the run index of a link needs updating
+    run_link = re.compile(r'sarif:/runs/(\d+)')
+
+    def empty(file_name):
+        return os.stat(file_name).st_size == 0
+
+    def is_input(file_name):
+        """ Tells if the file is a non empty report and not our output. """
+        # an empty file can not be parsed as JSON, so it is left out too
+        return (os.path.basename(file_name) != merged_name
+                and not empty(file_name))
+
+    def match_and_update_run(message, runs_count_offset):
+        """ Given a SARIF message object, adds the offset to the run index
+        of every embedded link in its 'text' property (in place). """
+        # property bags may hold a 'message' that is not a message object
+        if not isinstance(message, dict) or 'text' not in message:
+            return
+
+        def renumber(match):
+            run_index = runs_count_offset + int(match.group(1))
+            return 'sarif:/runs/%d' % run_index
+
+        message['text'] = run_link.sub(renumber, message['text'])
+
+    def update_sarif_object(sarif_object, runs_count_offset):
+        """ Walks a decoded SARIF value in place: every dictionary with a
+        'message' property gets the embedded links of that message updated.
+
+        Recursively looks through dictionary entries and list elements,
+        and returns the object it was given. """
+        if isinstance(sarif_object, dict):
+            match_and_update_run(sarif_object.get('message'),
+                                 runs_count_offset)
+            children = list(sarif_object.values())
+        elif isinstance(sarif_object, list):
+            children = sarif_object
+        else:
+            return sarif_object
+
+        for child in children:
+            update_sarif_object(child, runs_count_offset)
+        return sarif_object
+
+    sarif_files = (file_name for file_name
+                   in glob.iglob(os.path.join(output_dir, '*.sarif'))
+                   if is_input(file_name))
+    # the order of files returned by glob is not guaranteed to be sorted
+    if sort_files:
+        sarif_files = sorted(sarif_files)
+
+    runs_count = 0
+    merged = {}
+    for sarif_file in sarif_files:
+        # the input is closed again before its content is processed
+        with open(sarif_file) as fp:
+            sarif = json.load(fp)
+        if 'runs' not in sarif:
+            continue
+
+        if not merged:
+            # start with the first file, its links already use offset 0
+            merged = sarif
+        else:
+            # append the runs, shifting their links by the runs merged so far
+            merged['runs'].extend(update_sarif_object(run, runs_count)
+                                  for run in sarif['runs'])
+
+        # links of the following files have to skip all runs seen so far
+        runs_count += len(sarif['runs'])
+
+    with open(os.path.join(output_dir, merged_name), 'w') as out:
+        json.dump(merged, out, indent=4, sort_keys=True)
+
 
 def parse_bug_plist(filename):
     """ Returns the generator of bugs from a single .plist file. """
diff --git a/clang/tools/scan-build-py/tests/unit/test_analyze.py b/clang/tools/scan-build-py/tests/unit/test_analyze.py
@@ -128,7 +128,7 @@ def call(self, params):
 class RunAnalyzerTest(unittest.TestCase):
 
     @staticmethod
-    def run_analyzer(content, failures_report):
+    def run_analyzer(content, failures_report, output_format='plist'):
         with libear.TemporaryDirectory() as tmpdir:
             filename = os.path.join(tmpdir, 'test.cpp')
             with open(filename, 'w') as handle:
@@ -141,31 +141,46 @@ def run_analyzer(content, failures_report):
                 'direct_args': [],
                 'file': filename,
                 'output_dir': tmpdir,
-                'output_format': 'plist',
+                'output_format': output_format,
                 'output_failures': failures_report
             }
             spy = Spy()
             result = sut.run_analyzer(opts, spy.call)
-            return (result, spy.arg)
+            output_files = []
+            for entry in os.listdir(tmpdir):
+                output_files.append(entry)
+            return (result, spy.arg, output_files)
 
     def test_run_analyzer(self):
         content = "int div(int n, int d) { return n / d; }"
-        (result, fwds) = RunAnalyzerTest.run_analyzer(content, False)
+        (result, fwds, _) = RunAnalyzerTest.run_analyzer(content, False)
         self.assertEqual(None, fwds)
         self.assertEqual(0, result['exit_code'])
 
     def test_run_analyzer_crash(self):
         content = "int div(int n, int d) { return n / d }"
-        (result, fwds) = RunAnalyzerTest.run_analyzer(content, False)
+        (result, fwds, _) = RunAnalyzerTest.run_analyzer(content, False)
         self.assertEqual(None, fwds)
         self.assertEqual(1, result['exit_code'])
 
     def test_run_analyzer_crash_and_forwarded(self):
         content = "int div(int n, int d) { return n / d }"
-        (_, fwds) = RunAnalyzerTest.run_analyzer(content, True)
+        (_, fwds, _) = RunAnalyzerTest.run_analyzer(content, True)
         self.assertEqual(1, fwds['exit_code'])
         self.assertTrue(len(fwds['error_output']) > 0)
 
+    def test_run_analyzer_with_sarif(self):
+        # a sarif run shall leave a file matching 'result-*.sarif' behind
+        source = "int div(int n, int d) { return n / d; }"
+        (result, fwds, output_files) = RunAnalyzerTest.run_analyzer(
+            source, False, output_format='sarif')
+        self.assertEqual(None, fwds)
+        self.assertEqual(0, result['exit_code'])
+
+        sarif_name = re.compile(r'^result-.+\.sarif$')
+        if not any(sarif_name.match(entry) for entry in output_files):
+            self.fail('no result sarif files found in output')
+
 
 class ReportFailureTest(unittest.TestCase):
 
diff --git a/clang/tools/scan-build-py/tests/unit/test_report.py b/clang/tools/scan-build-py/tests/unit/test_report.py