Skip to content

Commit 0113530

Browse files
authored
Merge branch 'main' into feature/add-analytics
2 parents 97c6c8a + 74f5999 commit 0113530

File tree

6 files changed

+36
-30
lines changed

6 files changed

+36
-30
lines changed

Pipfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ name = "pypi"
66
[packages]
77
Scrapy = "==2.9.0"
88
selenium = "==4.9.1"
9-
meilisearch = "==0.25.0"
9+
meilisearch = "==0.26.0"
1010
requests-iap = "==0.2.0"
1111
python-keycloak-client = "==0.2.3"
1212
requests = "==2.31.0"
1313

1414
[dev-packages]
1515
pytest = "==7.3.1"
16-
pylint = "==2.15.9"
16+
pylint = "==2.17.4"
1717
tox = "==4.5.2"
1818
tox-pipenv = "==1.10.1"
1919
importlib_metadata = {version = "*", markers="python_version < '3.8'"}

Pipfile.lock

Lines changed: 12 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scraper/src/config/browser_handler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import builtins
12
import re
23
import os
34
from selenium import webdriver
@@ -29,7 +30,7 @@ def init(config_original_content, js_render, user_agent):
2930
CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH',
3031
"/usr/bin/chromedriver")
3132
if not os.path.isfile(CHROMEDRIVER_PATH):
32-
raise Exception(
33+
raise builtins.Exception(
3334
f"Env CHROMEDRIVER_PATH='{CHROMEDRIVER_PATH}' is not a path to a file")
3435
driver = webdriver.Chrome(
3536
CHROMEDRIVER_PATH,
scraper/src/config/config_validator.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11

22

3+
import builtins
4+
5+
36
class ConfigValidator:
47
config = None
58

@@ -18,52 +21,52 @@ def validate(self):
1821
# Start urls must be an array
1922
if self.config.start_urls and not isinstance(self.config.start_urls,
2023
list):
21-
raise Exception('start_urls should be list')
24+
raise builtins.Exception('start_urls should be list')
2225

2326
# Stop urls must be an array
2427
if self.config.stop_urls and not isinstance(self.config.stop_urls,
2528
list):
26-
raise Exception('stop_urls should be list')
29+
raise builtins.Exception('stop_urls should be list')
2730

2831
# Custom settings must be a dict
2932
if self.config.custom_settings and not isinstance(self.config.custom_settings,
3033
dict):
31-
raise Exception('custom_settings must be a dictionary')
34+
raise builtins.Exception('custom_settings must be a dictionary')
3235

3336
if self.config.js_render and not isinstance(self.config.js_render,
3437
bool):
35-
raise Exception('js_render should be boolean')
38+
raise builtins.Exception('js_render should be boolean')
3639

3740
# `js_wait` is set to 0s by default unless it is specified
3841
if self.config.js_wait and not isinstance(self.config.js_wait, int):
39-
raise Exception('js_wait should be integer')
42+
raise builtins.Exception('js_wait should be integer')
4043

4144
if self.config.use_anchors and not isinstance(self.config.use_anchors,
4245
bool):
43-
raise Exception('use_anchors should be boolean')
46+
raise builtins.Exception('use_anchors should be boolean')
4447

4548
if self.config.sitemap_alternate_links and not isinstance(
4649
self.config.sitemap_alternate_links, bool):
47-
raise Exception('sitemap_alternate_links should be boolean')
50+
raise builtins.Exception('sitemap_alternate_links should be boolean')
4851

4952
if self.config.sitemap_urls_regexs and not self.config.sitemap_urls:
50-
raise Exception(
53+
raise builtins.Exception(
5154
'You gave an regex to parse sitemap but you didn\'t provide a sitemap url')
5255

5356
if self.config.sitemap_urls_regexs and not self.config.sitemap_urls:
5457
for regex in self.config.sitemap_urls_regex:
5558
if not isinstance(regex, str):
56-
raise Exception(
59+
raise builtins.Exception(
5760
'You gave an bad regex: ' + regex + ' must be a string')
5861

5962
if self.config.force_sitemap_urls_crawling and not self.config.sitemap_urls:
60-
raise Exception(
63+
raise builtins.Exception(
6164
'You want to force the sitemap crawling but you didn\'t provide a sitemap url')
6265

6366
if not self.config.scrape_start_urls and not self.config.scrap_start_urls:
64-
raise Exception(
67+
raise builtins.Exception(
6568
'Please use only the new variable name: scrape_start_urls')
6669

6770
if self.config.nb_hits_max and not isinstance(self.config.nb_hits_max,
6871
int):
69-
raise Exception('nb_hits_max should be integer')
72+
raise builtins.Exception('nb_hits_max should be integer')

scraper/src/config/selectors_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import builtins
12
from ..helpers import css_to_xpath
23

34

@@ -27,7 +28,7 @@ def _parse_selectors_set(config_selectors):
2728
# Type
2829
if 'type' in selectors_set[key]:
2930
if selectors_set[key]['type'] not in ['xpath', 'css']:
30-
raise Exception(
31+
raise builtins.Exception(
3132
selectors_set[key][
3233
'type'] + 'is not a good selector type, it should be `xpath` or `css`')
3334
else:

scraper/src/config/urls_parser.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import builtins
12
import re
23
import copy
34

@@ -55,11 +56,11 @@ def parse(config_start_urls):
5556
start_url['variables'][match]['url'],
5657
start_url['variables'][match]['js'])
5758
else:
58-
raise Exception(
59+
raise builtins.Exception(
5960
"Bad arguments for variables." + match + " for url " +
6061
start_url['url'])
6162
else:
62-
raise Exception(
63+
raise builtins.Exception(
6364
"Missing " + match + " in variables" + " for url " +
6465
start_url['url'])
6566

0 commit comments

Comments
 (0)