Skip to content

Commit f3c54c2

Browse files
committed
Scancode: Fix false positive reported by scancode output analyser script
ScanCode can return several licenses for a single scanned file. This commit ensures that a file is not reported as lacking a permissive license if at least one license found in it is permissive. Previously, the script reported an issue if it found at least one license in a file that was not permissive. Additionally, catch more errors and provide specific details about failures. Provide unit tests.
1 parent 3c6c8ae commit f3c54c2

File tree

6 files changed

+1274
-1204
lines changed

6 files changed

+1274
-1204
lines changed

tools/test/travis-ci/scancode-evaluate.py

Lines changed: 91 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
distributed under the License is distributed on an "AS IS" BASIS,
1414
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1515
See the License for the specific language governing permissions and
16-
limitations
16+
limitations
1717
"""
1818

19-
# Assumptions for this script:
19+
# Assumptions for this script:
2020
# 1. directory_name is scanned directory.
2121
# Files are copied to this directory with full tree. As result, if we find
2222
# license offender, we can have full path (just scrape directory_name). We do this
@@ -29,6 +29,7 @@
2929
import os.path
3030
import logging
3131
import re
32+
import ntpath
3233

3334
userlog = logging.getLogger("scancode-evaluate")
3435
userlog.setLevel(logging.INFO)
@@ -40,92 +41,117 @@
4041
MISSING_PERMISIVE_LICENSE_TEXT = "Non-permissive license"
4142
MISSING_SPDX_TEXT = "Missing SPDX license identifier"
4243

43-
def license_check(directory_name, file):
44-
""" Check licenses in the scancode json file for specified directory
44+
class FileDecodeError(Exception):
45+
"""An exception for a failure to decode a file being tested."""
46+
47+
def path_leaf(path):
48+
"""Return the leaf of a path."""
49+
head, tail = ntpath.split(path)
50+
# Ensure the correct file name is returned if the file ends with a slash
51+
return tail or ntpath.basename(head)
52+
53+
def has_permissive_text_in_scancode_output(scancode_output_data_file_licenses):
54+
"""Returns true if at least one license in the scancode output is permissive."""
55+
return any(
56+
scancode_output_data_file_license['category'] == 'Permissive'
57+
for scancode_output_data_file_license in scancode_output_data_file_licenses
58+
)
59+
60+
def has_spdx_text_in_scancode_output(scancode_output_data_file_licenses):
61+
"""Returns true if at least one license in the scancode output has the spdx identifier."""
62+
return any(
63+
'spdx' in scancode_output_data_file_license['matched_rule']['identifier']
64+
for scancode_output_data_file_license in scancode_output_data_file_licenses
65+
)
66+
67+
def has_spdx_text_in_analysed_file(file):
68+
"""Returns true if the file analysed by ScanCode contains SPDX identifier."""
69+
try:
70+
with open(file, 'r') as read_file:
71+
filetext = read_file.read()
72+
except UnicodeDecodeError:
73+
raise FileDecodeError(
74+
"Unable to look for SPDX text in `{}`:".format(file)
75+
)
76+
77+
return re.findall("SPDX-License-Identifier:?", filetext)
78+
79+
def license_check(scancode_output):
80+
"""Check licenses in the scancode json file for specified directory.
4581
4682
This function does not verify if file exists, should be done prior the call.
4783
4884
Args:
49-
directory_name - where scancode was run, used to scrape this from paths
5085
file - scancode json output file (output from scancode --license --json-pp)
5186
52-
Returns:
87+
Returns:
5388
0 if nothing found
5489
>0 - count how many license issues found
5590
-1 if any error in file licenses found
5691
"""
5792

5893
offenders = []
5994
try:
60-
# find all licenses in the files, must be licensed and permissive
61-
with open(file, 'r') as scancode_output:
62-
results = json.load(scancode_output)
63-
except ValueError:
64-
userlog.warning("JSON could not be decoded")
95+
with open(scancode_output, 'r') as read_file:
96+
scancode_output_data = json.load(read_file)
97+
except json.JSONDecodeError as jex:
98+
userlog.warning("JSON could not be decoded, Invalid JSON in body: %s", jex)
6599
return -1
66100

67-
try:
68-
for file in results['files']:
69-
license_offender = {}
70-
license_offender['file'] = file
71-
# ignore directory, not relevant here
72-
if license_offender['file']['type'] == 'directory':
101+
if 'files' not in scancode_output_data:
102+
userlog.warning("Missing `files` attribute in %s" % (scancode_output))
103+
return -1
104+
105+
for scancode_output_data_file in scancode_output_data['files']:
106+
try:
107+
if scancode_output_data_file['type'] != 'file':
73108
continue
74-
if not license_offender['file']['licenses']:
75-
license_offender['reason'] = MISSING_LICENSE_TEXT
76-
offenders.append(license_offender.copy())
109+
except KeyError as e:
110+
userlog.warning("Could not find %s attribute in %s" % (str(e), scancode_output))
111+
return -1
112+
113+
try:
114+
if not scancode_output_data_file['licenses']:
115+
scancode_output_data_file['fail_reason'] = MISSING_LICENSE_TEXT
116+
offenders.append(scancode_output_data_file)
117+
# check the next file in the scancode output
77118
continue
78-
79-
found_spdx = spdx_check(offenders, license_offender)
80-
81-
if not found_spdx:
119+
except KeyError as e:
120+
userlog.warning("Could not find %s attribute in %s" % (str(e), scancode_output))
121+
return -1
122+
123+
try:
124+
if not has_permissive_text_in_scancode_output(scancode_output_data_file['licenses']):
125+
scancode_output_data_file['fail_reason'] = MISSING_PERMISIVE_LICENSE_TEXT
126+
offenders.append(scancode_output_data_file)
127+
except KeyError as e:
128+
userlog.warning("Could not find %s attribute in %s" % (str(e), scancode_output))
129+
return -1
130+
131+
try:
132+
if not has_spdx_text_in_scancode_output(scancode_output_data_file['licenses']):
133+
# Scancode does not recognize license notice in Python file headers.
134+
# Issue: https://github.com/nexB/scancode-toolkit/issues/1913
135+
# Therefore check if the file tested by ScanCode actually has a licence notice.
82136
try:
83-
# Issue reported here https://github.com/nexB/scancode-toolkit/issues/1913
84-
# We verify here if SPDX is not really there as SDPX is part of the license text
85-
# scancode has some problems detecting it properly
86-
with open(os.path.join(os.path.abspath(license_offender['file']['path'])), 'r') as spdx_file_check:
87-
filetext = spdx_file_check.read()
88-
matches = re.findall("SPDX-License-Identifier:?", filetext)
89-
if matches:
90-
continue
91-
license_offender['reason'] = MISSING_SPDX_TEXT
92-
offenders.append(license_offender.copy())
93-
except UnicodeDecodeError:
94-
# not valid file for license check
137+
file_path = os.path.abspath(scancode_output_data_file['path'])
138+
if not has_spdx_text_in_analysed_file(file_path):
139+
scancode_output_data_file['fail_reason'] = MISSING_SPDX_TEXT
140+
offenders.append(scancode_output_data_file)
141+
except FileDecodeError:
142+
# Ignore files that cannot be decoded
143+
# check the next file in the scancode output
95144
continue
96-
except KeyError:
97-
userlog.warning("Invalid scancode json file")
98-
return -1
145+
except KeyError as e:
146+
userlog.warning("Could not find %s attribute in %s" % (str(e), scancode_output))
147+
return -1
99148

100149
if offenders:
101150
userlog.warning("Found files with missing license details, please review and fix")
102151
for offender in offenders:
103-
userlog.warning("File: " + offender['file']['path'][len(directory_name):] + " " + "reason: " + offender['reason'])
152+
userlog.warning("File: %s reason: %s" % (path_leaf(offender['path']), offender['fail_reason']))
104153
return len(offenders)
105154

106-
107-
def spdx_check(offenders, license_offender):
108-
""" Parse through list of licenses to determine whether licenses are permissive
109-
@input list of offender, individual offender dict
110-
@output none
111-
"""
112-
found_spdx = False
113-
# iterate through licenses, stop once permissive license has been found
114-
for i in range(len(license_offender['file']['licenses'])):
115-
# is any of the licenses permissive ?
116-
if license_offender['file']['licenses'][i]['category'] == 'Permissive':
117-
# confirm that it has spdx license key
118-
if license_offender['file']['licenses'][i]['matched_rule']['identifier'].find("spdx") != -1:
119-
found_spdx = True
120-
# if no spdx found return anyway
121-
return found_spdx
122-
# otherwise file is missing permissive license
123-
license_offender['reason'] = MISSING_PERMISIVE_LICENSE_TEXT
124-
offenders.append(license_offender.copy())
125-
126-
# missing spdx and permissive license
127-
return found_spdx
128-
129155
def parse_args():
130156
parser = argparse.ArgumentParser(
131157
description="License check.")
@@ -135,11 +161,11 @@ def parse_args():
135161
help='Directory name where are files being checked')
136162
return parser.parse_args()
137163

138-
139164
if __name__ == "__main__":
165+
140166
args = parse_args()
141167
if args.file and os.path.isfile(args.file):
142-
count = license_check(args.directory_name, args.file)
168+
count = license_check(args.file)
143169
if count == 0:
144170
sys.exit(0)
145171
else:

tools/test/travis-ci/scancode_evaluate_test.py

Lines changed: 86 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -4,29 +4,28 @@
44
# SPDX-License-Identifier: Apache-2.0
55
import importlib
66
import os
7-
import sys
8-
from unittest import TestCase
7+
import pytest
98

109
# TODO: fix scancode to match python naming conventions
11-
SCANCODE_EVALUATE = importlib.import_module("scancode-evaluate")
12-
license_check = SCANCODE_EVALUATE.license_check
10+
license_check = importlib.import_module("scancode-evaluate").license_check
1311

1412
ROOT = os.path.abspath(
15-
os.path.join(os.path.dirname(__file__))
13+
os.path.join(os.path.dirname(__file__))
1614
)
1715

1816
# path to stub files
1917
stub_path = ROOT + "/scancode_test/"
2018

2119
# template copyright notices
22-
invalid_header_1 = "/* Copyright (C) Arm Limited, Inc - All Rights Reserved\
20+
header_without_spdx = "/* Copyright (C) Arm Limited, Inc - All Rights Reserved\
2321
* Unauthorized copying of this. file, via any medium is strictly prohibited\
2422
* Proprietary and confidential\
2523
*/"
2624

27-
invalid_header_2 = "/* mbed Microcontroller Library\
25+
header_with_spdx = "/* mbed Microcontroller Library\
2826
* Copyright (c) 2006-2013 ARM Limited\
2927
*\
28+
* SPDX-License-Identifier: Apache-2.0\
3029
* Licensed under the Apache License, Version 2.0 (the \"License\");\
3130
* you may not use this file except in compliance with the License.\
3231
* You may obtain a copy of the License at\
@@ -40,67 +39,83 @@
4039
* limitations under the License.\
4140
*/"
4241

43-
44-
# implement test class
45-
class TestScancodeEvaluate(TestCase):
46-
""" Test scancode evaluation script """
47-
48-
def test_scancode_case_1(self):
49-
""" Test Case 1 -- faulty json file
50-
@inputs scancode_test_1.json
51-
@outputs -1 if any error in file licenses found
52-
"""
53-
expected_result = -1
54-
test_json = ROOT + "/scancode_test/scancode_test_1.json"
55-
56-
# pass json path to test function
57-
result = license_check(ROOT, test_json)
58-
59-
self.assertEqual(expected_result, result)
60-
61-
def test_scancode_case_2(self):
62-
""" Test Case 2 -- no errors in license headers, try multiple types i.e Apache-2.0, BSD3
63-
@inputs scancode_test_2.json [4 Apache-2.0, 4 BSD-3.0]
64-
@outputs 0
65-
"""
66-
expected_result = 0
67-
test_json = ROOT + "/scancode_test/scancode_test_2.json"
68-
69-
result = license_check(ROOT, test_json)
70-
self.assertEqual(expected_result, result, "False Negative(s)")
71-
72-
def test_scancode_case_3(self):
73-
""" Test Case 3 -- all files containing errors
74-
@inputs scancode_test_3.json [2 no header, 2 non-permissive + spdx, 1 missing SPDX]
75-
@output 5
76-
"""
77-
# create stub files with a non-permissive license and missing spdx
78-
for i in range(3, 6):
79-
with open(stub_path + "test" + str(i) + ".h", "w") as file:
80-
if i == 5:
81-
file.write(invalid_header_2)
82-
else:
83-
file.write(invalid_header_1)
84-
85-
expected_result = 7
86-
test_json = ROOT + "/scancode_test/scancode_test_3.json"
87-
88-
result = license_check(ROOT, test_json)
89-
90-
self.assertEqual(expected_result, result, "False Positive(s)")
91-
# delete stub files
92-
os.remove(stub_path + "test3.h")
93-
os.remove(stub_path + "test4.h")
94-
os.remove(stub_path + "test5.h")
95-
96-
def test_scancode_case_4(self):
97-
""" Test Case 4 -- license header permissive and non-permissive 'license' [FP]
98-
@inputs scancode_test_4.json
99-
@outputs 0
100-
"""
101-
expected_result = 0
102-
test_json = ROOT + "/scancode_test/scancode_test_4.json"
103-
104-
result = license_check(ROOT, test_json)
105-
106-
self.assertEqual(expected_result, result, "Non-Permissive Header False Positive")
42+
@pytest.fixture()
43+
def resource_test3():
44+
"""Create stub files.
45+
test3.h missing license notice
46+
test4.h with license notice
47+
test5.h with license notice
48+
"""
49+
file_paths = [
50+
stub_path + "test3.h",
51+
stub_path + "test4.h",
52+
stub_path + "test5.h",
53+
]
54+
for file_path in file_paths:
55+
with open(file_path, "w") as new_file:
56+
if file_path in [stub_path + "test4.h", stub_path + "test5.h"]:
57+
new_file.write(header_with_spdx)
58+
else:
59+
new_file.write(header_without_spdx)
60+
yield "resource_test3"
61+
# delete stub files
62+
for file_path in file_paths:
63+
os.remove(file_path)
64+
65+
66+
@pytest.fixture()
67+
def resource_test4():
68+
"""Create stub files.
69+
test.h missing license notice
70+
"""
71+
file_paths = [
72+
stub_path + "test.h",
73+
]
74+
for file_path in file_paths:
75+
with open(file_path, "w") as new_file:
76+
new_file.write(header_without_spdx)
77+
yield "resource_test4"
78+
# delete stub files
79+
for file_path in file_paths:
80+
os.remove(file_path)
81+
82+
83+
class TestScancodeEvaluate:
84+
""" Test scancode evaluation script """
85+
def test_scancode_case_1(self):
86+
""" Test Case 1 -- Missing `files` attribute in JSON
87+
@inputs scancode_test/scancode_test_1.json
88+
@outputs -1
89+
"""
90+
scancode_output = ROOT + "/scancode_test/scancode_test_1.json"
91+
assert license_check(scancode_output) == -1
92+
93+
def test_scancode_case_2(self):
94+
""" Test Case 2 -- Various combinations where at least one license in
95+
a file is permissive and has spdx in the match.identifier
96+
attribute.
97+
@inputs scancode_test/scancode_test_2.json
98+
@outputs 0
99+
"""
100+
scancode_output = ROOT + "/scancode_test/scancode_test_2.json"
101+
assert license_check(scancode_output) == 0
102+
103+
def test_scancode_case_3(self, resource_test3):
104+
""" Test Case 3 -- Five file scanned with various issues:
105+
test.h: Missing license text (error count += 1)
106+
test3.h: Missing `Permissive` license text and `spdx` in match.identifier and not in file tested by ScanCode (error count += 2)
107+
test4.h: Missing `Permissive` license text and `spdx` in match.identifier but found in file tested by ScanCode (error count += 1)
108+
test5.h: Missing `spdx` in match.identifier but found in file tested by ScanCode. (error count += 0)
109+
@inputs scancode_test/scancode_test_3.json
110+
@output 4
111+
"""
112+
scancode_output = ROOT + "/scancode_test/scancode_test_3.json"
113+
assert license_check(scancode_output) == 4
114+
115+
def test_scancode_case_4(self, resource_test4):
116+
""" Test Case 4 -- Multiple `Permissive` licenses in one file but none with `spdx` in match.identifier and not in file tested by ScanCode (error count += 1)
117+
@inputs scancode_test/scancode_test_4.json
118+
@outputs 1
119+
"""
120+
scancode_output = ROOT + "/scancode_test/scancode_test_4.json"
121+
assert license_check(scancode_output) == 1, "Non-Permissive Header False Positive"

0 commit comments

Comments
 (0)