Commit 8889bb1

Merge pull request #276 from python/master: Sync Fork from Upstream Repo

2 parents: a92e893 + 4f17c5c

13 files changed: +157 -39 lines

.azure-pipelines/posix-steps.yml

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ steps:
   - script: ./venv/bin/python -m coverage xml
     displayName: 'Generate coverage.xml'

-  - script: source ./venv/bin/activate && bash <(curl -s https://codecov.io/bash)
+  - script: source ./venv/bin/activate && bash <(curl -s https://codecov.io/bash) -y .github/codecov.yml
     displayName: 'Publish code coverage results'


.github/workflows/coverage.yml

Lines changed: 2 additions & 2 deletions

@@ -65,7 +65,7 @@ jobs:
       - name: 'Publish code coverage results'
         run: |
           source ./.venv/bin/activate
-          bash <(curl -s https://codecov.io/bash)
+          bash <(curl -s https://codecov.io/bash) -y .github/codecov.yml
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

@@ -84,6 +84,6 @@ jobs:
         if: always()
         run: |
           make pythoninfo
-          bash <(curl -s https://codecov.io/bash)
+          bash <(curl -s https://codecov.io/bash) -y .github/codecov.yml
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

.travis.yml

Lines changed: 2 additions & 2 deletions

@@ -94,7 +94,7 @@ matrix:
       after_script:  # Probably should be after_success once test suite updated to run under coverage.py.
         # Make the `coverage` command available to Codecov w/ a version of Python that can parse all source files.
         - source ./venv/bin/activate
-        - bash <(curl -s https://codecov.io/bash)
+        - bash <(curl -s https://codecov.io/bash) -y .github/codecov.yml
     - name: "Test code coverage (C)"
       os: linux
       language: c

@@ -111,7 +111,7 @@ matrix:
         - xvfb-run make -j4 coverage-report
       after_script:  # Probably should be after_success once test suite updated to run under coverage.py.
         - make pythoninfo
-        - bash <(curl -s https://codecov.io/bash)
+        - bash <(curl -s https://codecov.io/bash) -y .github/codecov.yml

 before_install:
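
The three CI changes above (Azure Pipelines, GitHub Actions, Travis) are identical: the Codecov bash uploader's `-y` flag points it at a custom configuration file, so coverage settings are now read from `.github/codecov.yml` rather than from a default `codecov.yml` at the repository root.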

Lib/pkgutil.py

Lines changed: 7 additions & 6 deletions

@@ -638,8 +638,8 @@ def get_data(package, resource):
     return loader.get_data(resource_name)


-_DOTTED_WORDS = r'[a-z_]\w*(\.[a-z_]\w*)*'
-_NAME_PATTERN = re.compile(f'^({_DOTTED_WORDS})(:({_DOTTED_WORDS})?)?$', re.I)
+_DOTTED_WORDS = r'(?!\d)(\w+)(\.(?!\d)(\w+))*'
+_NAME_PATTERN = re.compile(f'^(?P<pkg>{_DOTTED_WORDS})(?P<cln>:(?P<obj>{_DOTTED_WORDS})?)?$', re.U)
 del _DOTTED_WORDS

 def resolve_name(name):

@@ -677,11 +677,12 @@ def resolve_name(name):
     m = _NAME_PATTERN.match(name)
     if not m:
         raise ValueError(f'invalid format: {name!r}')
-    groups = m.groups()
-    if groups[2]:
+    gd = m.groupdict()
+    if gd.get('cln'):
         # there is a colon - a one-step import is all that's needed
-        mod = importlib.import_module(groups[0])
-        parts = groups[3].split('.') if groups[3] else []
+        mod = importlib.import_module(gd['pkg'])
+        parts = gd.get('obj')
+        parts = parts.split('.') if parts else []
     else:
         # no colon - have to iterate to find the package boundary
         parts = name.split('.')
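
For orientation (not part of the diff): pkgutil.resolve_name accepts either a purely dotted name or a `module:object` form, and the rewritten pattern now admits any Unicode identifier segment while rejecting segments that start with a digit. A minimal usage sketch, with the behaviour inferred from this change and the tests below:

    import pkgutil

    # colon form: a one-step import of 'logging.handlers', then an attribute walk
    h1 = pkgutil.resolve_name('logging.handlers:SysLogHandler')
    # dotted-only form: iterate to find the package boundary
    h2 = pkgutil.resolve_name('logging.handlers.SysLogHandler')
    assert h1 is h2

    # a segment starting with a digit now fails the pattern match,
    # raising ValueError before any import is attempted
    pkgutil.resolve_name('9abc')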

Lib/test/test_pkgutil.py

Lines changed: 32 additions & 0 deletions

@@ -229,8 +229,40 @@ def test_name_resolution(self):
             ('logging.handlers:SysLogHandler.NO_SUCH_VALUE', AttributeError),
             ('logging.handlers.SysLogHandler.NO_SUCH_VALUE', AttributeError),
             ('ZeroDivisionError', ImportError),
+            ('os.path.9abc', ValueError),
+            ('9abc', ValueError),
         )

+        # add some Unicode package names to the mix.
+
+        unicode_words = ('\u0935\u092e\u0938',
+                         '\xe9', '\xc8',
+                         '\uc548\ub155\ud558\uc138\uc694',
+                         '\u3055\u3088\u306a\u3089',
+                         '\u3042\u308a\u304c\u3068\u3046',
+                         '\u0425\u043e\u0440\u043e\u0448\u043e',
+                         '\u0441\u043f\u0430\u0441\u0438\u0431\u043e',
+                         '\u73b0\u4ee3\u6c49\u8bed\u5e38\u7528\u5b57\u8868')
+
+        for uw in unicode_words:
+            d = os.path.join(self.dirname, uw)
+            os.makedirs(d, exist_ok=True)
+            # make an empty __init__.py file
+            f = os.path.join(d, '__init__.py')
+            with open(f, 'w') as f:
+                f.write('')
+                f.flush()
+            # now import the package we just created; clearing the caches is
+            # needed, otherwise the newly created package isn't found
+            importlib.invalidate_caches()
+            mod = importlib.import_module(uw)
+            success_cases += (uw, mod),
+            if len(uw) > 1:
+                failure_cases += (uw[:-1], ImportError),
+
+        # add an example with a Unicode digit at the start
+        failure_cases += ('\u0966\u0935\u092e\u0938', ValueError),
+
         for s, expected in success_cases:
             with self.subTest(s=s):
                 o = pkgutil.resolve_name(s)
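
As a quick illustration of what the new failure cases exercise (a sketch, with the pattern copied from the Lib/pkgutil.py change above): `\w` makes each segment a full Unicode identifier, while the `(?!\d)` lookahead rejects a leading digit in any script.

    import re

    _DOTTED_WORDS = r'(?!\d)(\w+)(\.(?!\d)(\w+))*'
    pat = re.compile(f'^(?P<pkg>{_DOTTED_WORDS})(?P<cln>:(?P<obj>{_DOTTED_WORDS})?)?$')

    assert pat.match('\u0935\u092e\u0938')                # Devanagari letters: accepted
    assert pat.match('\u0966\u0935\u092e\u0938') is None  # leading Devanagari digit: rejected
    assert pat.match('os.path.9abc') is None              # digit after a dot: rejected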
(3 binary files changed; contents not shown)

Modules/_xxtestfuzz/fuzz_tests.txt

Lines changed: 1 addition & 0 deletions

@@ -5,3 +5,4 @@ fuzz_json_loads
 fuzz_sre_compile
 fuzz_sre_match
 fuzz_csv_reader
+fuzz_struct_unpack

Modules/_xxtestfuzz/fuzzer.c

Lines changed: 75 additions & 1 deletion

@@ -79,6 +79,69 @@ static int fuzz_builtin_unicode(const char* data, size_t size) {
     return 0;
 }

+
+PyObject* struct_unpack_method = NULL;
+PyObject* struct_error = NULL;
+/* Called by LLVMFuzzerTestOneInput for initialization */
+static int init_struct_unpack() {
+    /* Import struct.unpack */
+    PyObject* struct_module = PyImport_ImportModule("struct");
+    if (struct_module == NULL) {
+        return 0;
+    }
+    struct_error = PyObject_GetAttrString(struct_module, "error");
+    if (struct_error == NULL) {
+        return 0;
+    }
+    struct_unpack_method = PyObject_GetAttrString(struct_module, "unpack");
+    return struct_unpack_method != NULL;
+}
+/* Fuzz struct.unpack(x, y) */
+static int fuzz_struct_unpack(const char* data, size_t size) {
+    /* Everything up to the first null byte is considered the
+       format. Everything after is the buffer */
+    const char* first_null = memchr(data, '\0', size);
+    if (first_null == NULL) {
+        return 0;
+    }
+
+    size_t format_length = first_null - data;
+    size_t buffer_length = size - format_length - 1;
+
+    PyObject* pattern = PyBytes_FromStringAndSize(data, format_length);
+    if (pattern == NULL) {
+        return 0;
+    }
+    PyObject* buffer = PyBytes_FromStringAndSize(first_null + 1, buffer_length);
+    if (buffer == NULL) {
+        Py_DECREF(pattern);
+        return 0;
+    }
+
+    PyObject* unpacked = PyObject_CallFunctionObjArgs(
+        struct_unpack_method, pattern, buffer, NULL);
+    /* Ignore any overflow errors, these are easily triggered accidentally */
+    if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_OverflowError)) {
+        PyErr_Clear();
+    }
+    /* The pascal format string will throw a negative size when passing 0
+       like: struct.unpack('0p', b'') */
+    if (unpacked == NULL && PyErr_ExceptionMatches(PyExc_SystemError)) {
+        PyErr_Clear();
+    }
+    /* Ignore any struct.error exceptions, these can be caused by invalid
+       formats or incomplete buffers both of which are common. */
+    if (unpacked == NULL && PyErr_ExceptionMatches(struct_error)) {
+        PyErr_Clear();
+    }
+
+    Py_XDECREF(unpacked);
+    Py_DECREF(pattern);
+    Py_DECREF(buffer);
+    return 0;
+}
+
+
 #define MAX_JSON_TEST_SIZE 0x10000

 PyObject* json_loads_method = NULL;

@@ -190,9 +253,10 @@ static int fuzz_sre_compile(const char* data, size_t size) {
         PyErr_Clear();
     }
     /* Ignore some common errors thrown by sre_parse:
-       Overflow, Assertion and Index */
+       Overflow, Assertion, Recursion and Index */
     if (compiled == NULL && (PyErr_ExceptionMatches(PyExc_OverflowError) ||
                              PyErr_ExceptionMatches(PyExc_AssertionError) ||
+                             PyErr_ExceptionMatches(PyExc_RecursionError) ||
                              PyErr_ExceptionMatches(PyExc_IndexError))
     ) {
         PyErr_Clear();

@@ -378,6 +442,16 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_builtin_unicode)
     rv |= _run_fuzz(data, size, fuzz_builtin_unicode);
 #endif
+#if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_struct_unpack)
+    static int STRUCT_UNPACK_INITIALIZED = 0;
+    if (!STRUCT_UNPACK_INITIALIZED && !init_struct_unpack()) {
+        PyErr_Print();
+        abort();
+    } else {
+        STRUCT_UNPACK_INITIALIZED = 1;
+    }
+    rv |= _run_fuzz(data, size, fuzz_struct_unpack);
+#endif
 #if !defined(_Py_FUZZ_ONE) || defined(_Py_FUZZ_fuzz_json_loads)
     static int JSON_LOADS_INITIALIZED = 0;
     if (!JSON_LOADS_INITIALIZED && !init_json_loads()) {
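
For readers who want to poke at the harness logic without libFuzzer, here is a minimal Python sketch of the same input convention (the function name mirrors the C one but is otherwise illustrative): bytes before the first NUL are the format, the rest is the buffer, and the exception classes the C code clears are swallowed.

    import struct

    def fuzz_struct_unpack(data: bytes) -> None:
        # everything up to the first null byte is the format,
        # everything after it is the buffer (like the C harness)
        fmt, sep, buf = data.partition(b'\0')
        if not sep:     # no null byte at all: nothing to do
            return
        try:
            struct.unpack(fmt, buf)
        except (struct.error, OverflowError):
            # invalid formats and short buffers are common and uninteresting;
            # the C harness additionally tolerates the SystemError raised by
            # the '0p' pascal-format corner case
            pass

    fuzz_struct_unpack(b'<HH\0\x01\x02\x03\x04')   # two little-endian unsigned shorts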

Parser/parsetok.c

Lines changed: 2 additions & 2 deletions

@@ -240,7 +240,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
 #endif

     for (;;) {
-        char *a, *b;
+        const char *a, *b;
         int type;
         size_t len;
         char *str;

@@ -371,7 +371,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
            buffer after parsing.  Trailing whitespace and comments
            are OK. */
         if (err_ret->error == E_DONE && start == single_input) {
-            char *cur = tok->cur;
+            const char *cur = tok->cur;
             char c = *tok->cur;

             for (;;) {

Parser/tokenizer.c

Lines changed: 30 additions & 20 deletions

@@ -59,7 +59,9 @@ tok_new(void)
                                             sizeof(struct tok_state));
     if (tok == NULL)
         return NULL;
-    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
+    tok->buf = tok->cur = tok->inp = NULL;
+    tok->start = NULL;
+    tok->end = NULL;
     tok->done = E_OK;
     tok->fp = NULL;
     tok->input = NULL;

@@ -111,7 +113,9 @@ error_ret(struct tok_state *tok) /* XXX */
     tok->decoding_erred = 1;
     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
         PyMem_FREE(tok->buf);
-    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
+    tok->buf = tok->cur = tok->inp = NULL;
+    tok->start = NULL;
+    tok->end = NULL;
     tok->done = E_DECODE;
     return NULL;                /* as if it were EOF */
 }

@@ -664,11 +668,11 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
    Look for encoding declarations inside STR, and record them
    inside TOK. */

-static const char *
+static char *
 decode_str(const char *input, int single, struct tok_state *tok)
 {
     PyObject* utf8 = NULL;
-    const char *str;
+    char *str;
     const char *s;
     const char *newl[2] = {NULL, NULL};
     int lineno = 0;

@@ -726,43 +730,46 @@ struct tok_state *
 PyTokenizer_FromString(const char *str, int exec_input)
 {
     struct tok_state *tok = tok_new();
+    char *decoded;
+
     if (tok == NULL)
         return NULL;
-    str = decode_str(str, exec_input, tok);
-    if (str == NULL) {
+    decoded = decode_str(str, exec_input, tok);
+    if (decoded == NULL) {
         PyTokenizer_Free(tok);
         return NULL;
     }

-    /* XXX: constify members. */
-    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
+    tok->buf = tok->cur = tok->inp = decoded;
+    tok->end = decoded;
     return tok;
 }

 struct tok_state *
 PyTokenizer_FromUTF8(const char *str, int exec_input)
 {
     struct tok_state *tok = tok_new();
+    char *translated;
     if (tok == NULL)
         return NULL;
-    tok->input = str = translate_newlines(str, exec_input, tok);
-    if (str == NULL) {
+    tok->input = translated = translate_newlines(str, exec_input, tok);
+    if (translated == NULL) {
         PyTokenizer_Free(tok);
         return NULL;
     }
     tok->decoding_state = STATE_RAW;
     tok->read_coding_spec = 1;
     tok->enc = NULL;
-    tok->str = str;
+    tok->str = translated;
     tok->encoding = (char *)PyMem_MALLOC(6);
     if (!tok->encoding) {
         PyTokenizer_Free(tok);
         return NULL;
     }
     strcpy(tok->encoding, "utf-8");

-    /* XXX: constify members. */
-    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
+    tok->buf = tok->cur = tok->inp = translated;
+    tok->end = translated;
     return tok;
 }

@@ -812,7 +819,7 @@ PyTokenizer_Free(struct tok_state *tok)
     if (tok->fp != NULL && tok->buf != NULL)
         PyMem_FREE(tok->buf);
     if (tok->input)
-        PyMem_FREE((char *)tok->input);
+        PyMem_FREE(tok->input);
     PyMem_FREE(tok);
 }

@@ -1138,7 +1145,7 @@ tok_decimal_tail(struct tok_state *tok)

 /* Get next token, after space stripping etc. */

 static int
-tok_get(struct tok_state *tok, char **p_start, char **p_end)
+tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
 {
     int c;
     int blankline, nonascii;

@@ -1321,7 +1328,7 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
                    && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                 if (is_type_ignore) {
-                    *p_start = (char *) ignore_end;
+                    *p_start = ignore_end;
                     *p_end = tok->cur;

                     /* If this type ignore is the only thing on the line, consume the newline also. */

@@ -1331,7 +1338,7 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
                    }
                     return TYPE_IGNORE;
                 } else {
-                    *p_start = (char *) type_start;  /* after type_comment_prefix */
+                    *p_start = type_start;  /* after type_comment_prefix */
                     *p_end = tok->cur;
                     return TYPE_COMMENT;
                 }

@@ -1410,7 +1417,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
        Look ahead one token to see if that is 'def'. */

     struct tok_state ahead_tok;
-    char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
+    const char *ahead_tok_start = NULL;
+    const char *ahead_tok_end = NULL;
     int ahead_tok_kind;

     memcpy(&ahead_tok, tok, sizeof(ahead_tok));

@@ -1798,7 +1806,7 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
 }

 int
-PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
+PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
 {
     int result = tok_get(tok, p_start, p_end);
     if (tok->decoding_erred) {

@@ -1823,7 +1831,9 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
 {
     struct tok_state *tok;
     FILE *fp;
-    char *p_start =NULL , *p_end =NULL , *encoding = NULL;
+    const char *p_start = NULL;
+    const char *p_end = NULL;
+    char *encoding = NULL;

     fd = _Py_dup(fd);
     if (fd < 0) {
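
Taken together, the Parser/parsetok.c and Parser/tokenizer.c changes are one const-correctness cleanup: the `p_start`/`p_end` out-parameters of `tok_get`, `PyTokenizer_Get` and their callers become `const char **`, `decode_str` returns a mutable buffer so the `(char *)` casts can go, and `tok->start`/`tok->end` are assigned separately from the mutable pointers, presumably so they can be const-qualified in the struct (the struct definition itself is not part of this diff). The two old `/* XXX: constify members. */` comments are thereby resolved.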
