
Commit b9a2071

Author: David de Hilster
Merge pull request #16 from dehilsterlexis/NLP-TUTORIALS-011
NLP-TUTORIALS-011 Added tutorial 13 (13-a and 13-b)
2 parents df2aa0a + c8a4c8f commit b9a2071

40 files changed: +3089 −0 lines changed

tutorial-13/README.md

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# TUTORIAL-13

This tutorial shows off version 2 of the NLP Engine and VisualText. The main differences are the KB View, with its new dictionary and KBB files, and the improved tools for building and maintaining NLP analyzers that feed one into another.

## Tutorial 13-a

This analyzer fetches informational web pages on the 50 American states.

## Tutorial 13-b

This analyzer processes the 50 state web pages and builds dictionaries and a knowledge base from them.
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
{
  "visualText": [
    {
      "name": "Analyzer",
      "type": "state",
      "currentTextFile": "c:\\git\\nlp-tutorials\\tutorial-13\\tutorial-13-a\\input\\urls.txt",
      "currentPassFile": "c:\\git\\nlp-tutorials\\tutorial-13\\tutorial-13-a\\spec\\hrefs.nlp"
    }
  ]
}
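This state file records which text file and pass file the analyzer currently has open in VisualText. As a minimal sketch of how one might inspect it from Python (the file name `state.json` and its location are assumptions; the diff does not name this file):

```python
import json
from pathlib import Path

# Hypothetical path: the diff does not name this file, so "state.json" is an assumption.
state_path = Path(r"c:\git\nlp-tutorials\tutorial-13\tutorial-13-a\state.json")

with state_path.open(encoding="utf-8") as f:
    state = json.load(f)

# The "visualText" key holds a list of analyzer entries; show the current files for each.
for entry in state["visualText"]:
    print(entry["name"], entry["currentTextFile"], entry["currentPassFile"])
```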

tutorial-13/tutorial-13-a/README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Tutorial 13-a

This analyzer parses the URLs from this link: https://state.1keydata.com/ into a URL list. A Python script then fetches the web pages and saves them to a folder, which can easily be moved into the second analyzer, where the pages are processed.
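The URL extraction itself is done by the analyzer's NLP++ pass (`hrefs.nlp` under `spec`, per the state file above). As a rough illustration of the same idea, here is a hedged Python sketch that pulls the relative hrefs out of a saved copy of the landing page and joins them with the site's base URL; the local file names are assumptions, not files from this commit:

```python
from urllib.parse import urljoin
from bs4 import BeautifulSoup

BASE = "https://state.1keydata.com/"

# "states.html" is a hypothetical local copy of the landing page.
with open("states.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# Collect every relative href (e.g. "alabama.php") and turn it into a full URL.
urls = [urljoin(BASE, a["href"]) for a in soup.find_all("a", href=True)]

with open("urls.txt", "w", encoding="utf-8") as out:
    out.write("\n".join(urls) + "\n")
```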
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
<a href="alabama.php">Alabama</a>
<br><a href="alaska.php">Alaska</a>
<br><a href="arizona.php">Arizona</a>
<br><a href="arkansas.php">Arkansas</a>
<br><a href="california.php">California</a>
<br><a href="colorado.php">Colorado</a>
<br><a href="connecticut.php">Connecticut</a>
<br><a href="delaware.php">Delaware</a>
<br><a href="florida.php">Florida</a>
<br><a href="georgia.php">Georgia</a>
<br><a href="hawaii.php">Hawaii</a>
<br><a href="idaho.php">Idaho</a>
<br><a href="illinois.php">Illinois</a>
</div>
<div id="col2">
<a href="indiana.php">Indiana</a>
<br><a href="iowa.php">Iowa</a>
<br><a href="kansas.php">Kansas</a>
<br><a href="kentucky.php">Kentucky</a>
<br><a href="louisiana.php">Louisiana</a>
<br><a href="maine.php">Maine</a>
<br><a href="maryland.php">Maryland</a>
<br><a href="massachusetts.php">Massachusetts</a>
<br><a href="michigan.php">Michigan</a>
<br><a href="minnesota.php">Minnesota</a>
<br><a href="mississippi.php">Mississippi</a>
<br><a href="missouri.php">Missouri</a>
<br><a href="montana.php">Montana</a>
</div>
<div id="col3">
<a href="nebraska.php">Nebraska</a>
<br><a href="nevada.php">Nevada</a>
<br><a href="new-hampshire.php">New Hampshire</a>
<br><a href="new-jersey.php">New Jersey</a>
<br><a href="new-mexico.php">New Mexico</a>
<br><a href="new-york.php">New York</a>
<br><a href="north-carolina.php">North Carolina</a>
<br><a href="north-dakota.php">North Dakota</a>
<br><a href="ohio.php">Ohio</a>
<br><a href="oklahoma.php">Oklahoma</a>
<br><a href="oregon.php">Oregon</a>
<br><a href="pennsylvania.php">Pennsylvania</a>
</div>
<div id="col4">
<a href="rhode-island.php">Rhode Island</a>
<br><a href="south-carolina.php">South Carolina</a>
<br><a href="south-dakota.php">South Dakota</a>
<br><a href="tennessee.php">Tennessee</a>
<br><a href="texas.php">Texas</a>
<br><a href="utah.php">Utah</a>
<br><a href="vermont.php">Vermont</a>
<br><a href="virginia.php">Virginia</a>
<br><a href="washington.php">Washington</a>
<br><a href="west-virginia.php">West Virginia</a>
<br><a href="wisconsin.php">Wisconsin</a>
<br><a href="wyoming.php">Wyoming</a>
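The fetch script later in this commit derives each local file name from the last path segment of the URL. A tiny standalone check of that logic (the example URL is built from the list above):

```python
# Reproduces the filename logic used in the fetch script below:
# take the last path segment, drop ".php", and save under html/<name>.html
url = "https://state.1keydata.com/new-hampshire.php"
state = url.strip().split("/")[-1]
filename = state.replace(" ", "_").replace(".php", "") + ".html"
print(filename)  # new-hampshire.html
```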
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import urllib.request
import os
import codecs
from urllib.error import URLError, HTTPError
from bs4 import BeautifulSoup
from pathlib import Path
import re

# Read the list of state URLs produced by the analyzer.
wordsfile = os.path.join(os.path.dirname(__file__), "urls.txt")
file1 = codecs.open(wordsfile, "r", "utf-8")
lines = file1.readlines()

urlbase = "https://state.1keydata.com/"

count = 0
for url in lines:
    url = url.strip()
    # Use the last path segment (e.g. "alabama.php") as the local file name.
    pieces = url.split("/")
    state = pieces[len(pieces)-1]
    filepath = state.replace(" ", "_").replace(".php", "")
    htmlDir = os.path.join(os.path.dirname(__file__), "html")
    if not os.path.exists(htmlDir):
        os.makedirs(htmlDir)
    statefile = os.path.join(htmlDir, filepath + ".html")
    print(statefile, end=" ")

    # Skip pages that were already downloaded on a previous run.
    if os.path.exists(statefile):
        print(' exists')
        continue

    found = False

    try:
        page = urllib.request.urlopen(url)
    except HTTPError as e:
        # Log URLs that returned an HTTP error so they can be retried later.
        print(' Error code: ', e.code)
        file1 = open(os.path.join(os.path.dirname(__file__), "urlorphans.txt"), "a")
        file1.write(url + "\n")
        file1.close()
    except URLError as e:
        print('Reason: ', e.reason)
    else:
        found = True

    if found == True:
        pagehtml = page.read()
        soup = BeautifulSoup(pagehtml, 'html.parser')
        body = soup.find('body')

        # Save the URL plus the page body; log pages that have no <body> element.
        if body:
            print(' DOWNLOADED')
            file = codecs.open(statefile, "w", "utf-8")
            file.write(url + '\n' + str(body))
            file.close()
        else:
            print(' no defs')
            file2 = open(os.path.join("input", "nobody.txt"), "a")
            file2.write(url + "\n")
            file2.close()
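A quick, hypothetical way to confirm a run produced one page per state; the paths mirror the script above, but this snippet is not part of the commit:

```python
from pathlib import Path

# The fetch script writes its output next to itself, in an "html" folder.
html_dir = Path(__file__).parent / "html"
pages = sorted(html_dir.glob("*.html"))
print(len(pages), "pages downloaded")  # expect 50 if every fetch succeeded
```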
