Skip to content

Commit 9a818f7

Browse files
committed
scripts : improve get-pg.sh (#4838)
1 parent 18adb4e commit 9a818f7

File tree

1 file changed

+24
-1
lines changed

1 file changed

+24
-1
lines changed

scripts/get-pg.sh

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,22 @@
22

33
# Print the help text for this script to stdout, then exit with status 1.
#
# The help names the single positional argument <n> (the number of essays
# to download) and lists, for selected values of n, how many tokens the
# resulting pg.txt file will have.
function usage {
    # Synopsis: script name first, followed by its argument.
    printf 'usage: %s <n>\n' "$0"

    # One output line per argument; single-quoted so nothing is expanded.
    printf '%s\n' \
        'note: n is the number of essays to download' \
        'for specific n, the resulting pg.txt file will have the following number of tokens:' \
        'n | tokens' \
        '--- | ---' \
        '1 | 6230' \
        '2 | 23619' \
        '5 | 25859' \
        '10 | 36888' \
        '15 | 50188' \
        '20 | 59094' \
        '25 | 88764' \
        '30 | 103121' \
        '32 | 108338' \
        '35 | 113403' \
        '40 | 127699' \
        '45 | 135896'

    exit 1
}
723

@@ -33,10 +49,17 @@ if [ -f pg.txt ]; then
3349
rm pg.txt
3450
fi
3551

52+
c=1
3653
for url in $urls; do
3754
echo "processing $url"
3855

39-
curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt
56+
cc=$(printf "%03d" $c)
57+
58+
curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
59+
cat pg-$cc-one.txt >> pg.txt
60+
61+
cp -v pg.txt pg-$cc-all.txt
62+
c=$((c+1))
4063

4164
# don't flood the server
4265
sleep 1

0 commit comments

Comments
 (0)