Skip to content

Commit 493ce90

Browse files
ggerganov authored and jordankanter committed
scripts : script to get Paul Graham essays in txt format (ggml-org#4838)
1 parent e15b42c commit 493ce90

File tree

1 file changed

+47
-0
lines changed

1 file changed

+47
-0
lines changed

scripts/get-pg.sh

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/bin/bash
2+
3+
# Print a one-line usage message and terminate the script.
# Arguments: none (reads $0 for the script name)
# Outputs:   "usage: <script> <n>" on stdout
# Exits:     always, with status 1
usage() {
  # The placeholder belongs after the script name; the previous format
  # string glued "<n>" directly in front of $0.
  echo "usage: $0 <n>"
  exit 1
}
7+
8+
# Abort the script unless the named command resolves to an executable file.
# Arguments: $1 - name of the command to look up with `command -v`
# Outputs:   "error: <name> is not available" on stderr when the lookup fails
# Exits:     with status 1 when the command is not available; otherwise
#            returns normally so the caller can continue
has_cmd() {
  # "$1" must be quoted: unquoted, a value containing whitespace would be
  # split into several names and `command -v` would succeed as soon as the
  # first of them exists, hiding the missing one.
  if ! [ -x "$(command -v "$1")" ]; then
    echo "error: $1 is not available" >&2
    exit 1
  fi
}
14+
15+
# Download the first <n> essays listed in the Paul Graham RSS feed, convert
# each one from HTML to plain text wrapped at 80 columns, and concatenate the
# results into pg.txt in the current directory.

# check for every external tool used below:
# curl, html2text, tail, sed, fmt, grep, head
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt
has_cmd grep
has_cmd head

if [ $# -ne 1 ]; then
  usage
fi

n=$1

# <n> is handed to `head -n`, so it must be a plain non-negative integer
case "$n" in
  '' | *[!0-9]*) usage ;;
esac

# get urls: keep only feed lines mentioning "html", trim everything before
# "http" and after the first ".html", then take the first <n> entries
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss \
  | grep html \
  | sed -e "s/.*http/http/" -e "s/html.*/html/" \
  | head -n "$n")"

# a failed download or an empty feed would otherwise end with a misleading
# "done" message and no output file
if [ -z "$urls" ]; then
  echo "error: no essay urls retrieved" >&2
  exit 1
fi

printf "urls:\n%s\n" "$urls"

# start from a clean output file; essays are appended one by one below
rm -f pg.txt

# read the list line by line on fd 3 so the URLs are neither word-split nor
# glob-expanded, and so commands in the loop body cannot consume the list
while IFS= read -r url <&3; do
  [ -n "$url" ] || continue

  echo "processing $url"

  # drop the first 3 lines of html2text output, strip leading whitespace,
  # and re-wrap to 80 columns before appending
  curl -L "$url" \
    | html2text \
    | tail -n +4 \
    | sed -E "s/^[[:space:]]+//g" \
    | fmt -w 80 >> pg.txt

  # don't flood the server
  sleep 1
done 3<<< "$urls"

echo "done. data in pg.txt"

exit 0

0 commit comments

Comments
 (0)