-
-
Notifications
You must be signed in to change notification settings - Fork 46.9k
Create emails_from_url.py #1756
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
0f96b74
090cd66
f86fd16
df2938d
13ec4e5
a0af0fd
47d15bb
f0f0118
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
"""Get the site emails from URL.""" | ||
__author__ = "Muhammad Umer Farooq" | ||
__license__ = "MIT" | ||
__version__ = "1.0.0" | ||
__maintainer__ = "Muhammad Umer Farooq" | ||
__email__ = "[email protected]" | ||
__status__ = "Production" | ||
|
||
from html.parser import HTMLParser | ||
import requests | ||
import re | ||
from urllib.parse import urlparse | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's drop this line and then below change the call to `parse.urlparse()`. |
||
|
||
from urllib import parse | ||
|
||
|
||
class Parser(HTMLParser):
    """Collect the distinct link targets of every ``<a href="...">`` tag.

    Each ``href`` is resolved against ``domain`` with
    ``urllib.parse.urljoin`` and stored once, in first-seen order, in
    ``self.data``.  ``domain`` is used as the *base* of that join, so it
    needs to carry a scheme (``https://example.com``) for relative links to
    come out as absolute URLs.

    >>> parser = Parser("https://example.com")
    >>> parser.feed('<a href="/a">1</a><a href="/a">2</a><a href="#">3</a>')
    >>> parser.data
    ['https://example.com/a']
    """

    def __init__(self, domain: str) -> None:
        """Create an empty collector that resolves links against ``domain``."""
        super().__init__()
        # Resolved URLs in the order they were first encountered.
        self.data = []
        self.domain = domain

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Record the resolved ``href`` of an anchor tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; ``value`` is ``None`` for an attribute written
        without a value.
        """
        # Only anchor tags carry the links we are interested in.
        if tag != "a":
            return
        for name, value in attrs:
            # Skip other attributes, missing/empty hrefs and bare "#" links.
            if name != "href" or not value or value == "#":
                continue
            url = parse.urljoin(self.domain, value)
            # Compare the *resolved* URL: that is what is stored, so this is
            # the only check that actually prevents duplicates.
            if url not in self.data:
                self.data.append(url)
|
||
|
||
# Get main domain name (example.com) | ||
def get_domain_name(url: str) -> str:
    """Return the last two dot-separated labels of the URL's host.

    This is a purely textual rule (no public-suffix lookup), so a host such
    as ``www.example.co.uk`` yields ``co.uk``.  An empty string is returned
    when the host has fewer than two labels, which includes URLs that have
    no host at all.

    >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'c.d'
    >>> get_domain_name("Not a URL!")
    ''
    """
    labels = get_sub_domain_name(url).split(".")
    # Explicit length check instead of catching IndexError with a bare
    # ``except``, which would also have hidden unrelated errors.
    if len(labels) < 2:
        return ""
    return ".".join(labels[-2:])
|
||
|
||
# Get sub domain name (sub.example.com) | ||
def get_sub_domain_name(url: str) -> str:
    """Return the network-location part of ``url`` (e.g. ``sub.example.com``).

    The value is ``urlparse(url).netloc`` unchanged, so it still contains
    any user-info or port that was present in the URL.  An empty string is
    returned when the URL has no network location or cannot be parsed.

    >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'a.b.c.d'
    >>> get_sub_domain_name("Not a URL!")
    ''
    """
    try:
        return parse.urlparse(url).netloc
    except ValueError:
        # urlparse raises ValueError for malformed netlocs (for example an
        # unbalanced IPv6 bracket); treat those as "no sub-domain".
        return ""
|
||
def find_emails(text: str, domain: str) -> list:
    """Return the distinct e-mail addresses at ``domain`` found in ``text``.

    Addresses are returned in first-seen order.  The domain is escaped
    before it is placed in the pattern so that its dots only match literal
    dots, and the local part may contain ``.``, ``_``, ``+`` and ``-`` so
    that addresses such as ``first.last@...`` are not truncated.

    >>> find_emails("a@x.org, b.c@x.org, a@x.org, d@xyorg", "x.org")
    ['a@x.org', 'b.c@x.org']
    >>> find_emails("a@x.org", "")
    []
    """
    # Without a domain the pattern would match any "something@" fragment.
    if not domain:
        return []
    pattern = r"[a-zA-Z0-9._+-]+@" + re.escape(domain)
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(re.findall(pattern, text)))


def emails_from_url(url: str = "https://github.com", timeout: float = 10) -> list:
    """Return the e-mail addresses of the URL's domain found on linked pages.

    The page at ``url`` is downloaded, every ``<a href>`` on it is followed
    once, and each linked page is searched for addresses that end in the
    domain computed by ``get_domain_name(url)``.

    :param url: page whose links are crawled.
    :param timeout: seconds to wait for each HTTP request.
    :return: distinct addresses in first-seen order.
    :raises requests.RequestException: if ``url`` itself cannot be fetched.
    """
    # Get the base domain from the url
    domain = get_domain_name(url)

    # Resolve links against the full URL (not the bare domain) so that
    # relative hrefs become absolute, fetchable URLs.
    parser = Parser(url)
    # pass the raw HTML to the parser to get links
    parser.feed(requests.get(url, timeout=timeout).text)

    emails = []
    # Get links and loop through
    for link in parser.data:
        try:
            page = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # Best effort: skip links that cannot be fetched (unreachable
            # hosts, non-HTTP schemes such as mailto:, time-outs, ...).
            continue
        emails.extend(find_emails(page.text, domain))

    # Remove duplicates email address.
    return list(dict.fromkeys(emails))


if __name__ == "__main__":
    try:
        # Finally print list of email.
        print(emails_from_url("https://github.com"))
    except requests.RequestException:
        print("Please provide the valid url")
        raise SystemExit(1)
Uh oh!
There was an error while loading. Please reload this page.