Create emails_from_url.py #1756
Changes from 2 commits
@@ -0,0 +1,87 @@
"""Get the site emails from URL.""" | ||
__author__ = "Muhammad Umer Farooq" | ||
__license__ = "MIT" | ||
__version__ = "1.0.0" | ||
__maintainer__ = "Muhammad Umer Farooq" | ||
__email__ = "[email protected]" | ||
__status__ = "Production" | ||
|
||
import re | ||
import requests | ||
from urllib import parse | ||
from html.parser import HTMLParser | ||
|
||
|
||

class Parser(HTMLParser):
    """Class used to parse HTML; its handle_starttag method
    collects URLs from anchor tags."""

    def __init__(self, domain):

Review comment: Please add type hints as discussed in CONTRIBUTING.md.

        HTMLParser.__init__(self)
        self.data = []
        self.domain = domain

    def handle_starttag(self, tag, attrs):

Review comment: Please add type hints as discussed in CONTRIBUTING.md.

        # Only parse the 'anchor' tag.
        if tag == "a":
            # Check the list of defined attributes.
            for name, value in attrs:
                # If href is defined, and neither empty nor "#", process it.
                if name == "href" and value != "#" and value != '':
                    # Resolve the link first, then deduplicate on the
                    # resolved URL (comparing the raw href against joined
                    # URLs would never match).
                    url = parse.urljoin(self.domain, value)
                    if url not in self.data:
                        self.data.append(url)

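# Illustrative sketch (not part of the PR; the base URL and HTML snippet
# are hypothetical): feeding markup to the parser collects absolute links
# while skipping empty and "#" hrefs:
#
#   parser = Parser("https://example.com")
#   parser.feed('<a href="/about">About</a> <a href="#">top</a>')
#   parser.data  # -> ["https://example.com/about"]
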
# Get main domain name (example.com)
def get_domain_name(url):

Review comment: Please add type hints and doctests as discussed in CONTRIBUTING.md.

    return '.'.join(get_sub_domain_name(url).split('.')[-2:])


# Get sub domain name (sub.example.com)
def get_sub_domain_name(url):

Review comment: Please add type hints and doctests as discussed in CONTRIBUTING.md.
(lablnet marked this conversation as resolved.)

    return parse.urlparse(url).netloc

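# Hedged examples of the intended behavior (the URL is hypothetical, not
# taken from the PR); these could become the doctests the reviewer asked for:
#
#   get_domain_name("https://sub.example.com/path")      -> "example.com"
#   get_sub_domain_name("https://sub.example.com/path")  -> "sub.example.com"
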

if __name__ == '__main__':
    # Get the url
    url = "https://github.com"
    # Get the base domain from the url
    domain = get_domain_name(url)

    # Initialize the parser with the full URL so relative links resolve
    # against a base that has a scheme.
    parser = Parser(url)

    # Email regex; escape the domain so its dots match literally.
    emailRegx = '[a-zA-Z0-9]+@' + re.escape(domain)

    # Open URL
    r = requests.get(url)

    # Pass the raw HTML to the parser to get links.
    parser.feed(r.text)

    # Store Email Data structure.
    Emails = []
    # Get links and loop through.
    for link in parser.data:
        # Open URL.
        read = requests.get(link)
        # Get the valid emails.
        email = re.findall(emailRegx, read.text)
        # If not in list then append it.
        if email not in Emails:
            Emails.append(email)

    ValidEmails = []

    # Remove duplicate email addresses.
    for Email in Emails:
        for e in Email:
            if e not in ValidEmails:
                ValidEmails.append(e)

    # Finally, print the list of emails.
    print(ValidEmails)
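
For a quick sanity check of the regex step above, here is a hedged, self-contained sketch (the domain and addresses are invented, not from the PR) of what re.findall returns on a page body:

    import re

    domain = "example.com"  # hypothetical stand-in for the crawled domain
    email_regx = "[a-zA-Z0-9]+@" + re.escape(domain)
    sample = '<p>Write to alice@example.com or bob@example.com.</p>'
    print(re.findall(email_regx, sample))
    # -> ['alice@example.com', 'bob@example.com']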