# Patch summary (descriptive preamble only; the diff text below is unchanged).
#
# Files touched: danker.sh and lib/createLinks.sh.
#
# What the hunks change:
#   * Every `sort` invocation on a changed line gains `-S 50%`. Per the GNU
#     sort manual, -S/--buffer-size sets the main-memory sort buffer, and a
#     `%` suffix means a percentage of physical memory.
#   * In lib/createLinks.sh, the final `sort ... | uniq` pipeline that writes
#     "$1"-"$dump_date"".links" becomes a single `sort -S 50% -u`.
#   * All other sort options, input files and output files are identical
#     between each '-' line and its '+' counterpart.
#
# NOTE(review): `-S 50%` is also added to the `sort -u | head -n 1` that
# derives dump_date from the downloaded *.xml files; presumably that input is
# small and the flag has no practical effect there - confirm.
#
# NOTE(review): as seen here, the hunks' line breaks are collapsed onto a few
# long lines; presumably the original newlines and indentation must be
# restored before `git apply` / `patch` can consume this - confirm against
# the original commit.
diff --git a/danker.sh b/danker.sh index d6f1e87..a90fe5b 100755 --- a/danker.sh +++ b/danker.sh @@ -24,15 +24,15 @@ if [ "$1" == "ALL" ]; then filename='all.links' for i in `./lib/getLanguages.sh`; do ./lib/createLinks.sh "$i" >> all-link-files.txt; done for i in `cat all-link-files.txt`; do cat "$i" >> "$filename"; done - sort --field-separator=$'\t' --key=1 --temporary-directory=. -no "$filename" "$filename" + sort -S 50% --field-separator=$'\t' --key=1 --temporary-directory=. -no "$filename" "$filename" else filename=`./lib/createLinks.sh "$1"` fi if [ "$2" == "BIGMEM" ]; then ./lib/dankerBigMem.py "$filename" $damping_factor $iterations $start_value | sed "s/\(.*\)/Q\1/" > "$filename".rank else - sort --field-separator=$'\t' --key=2 --temporary-directory=. -no "$filename"".right" "$filename" + sort -S 50% --field-separator=$'\t' --key=2 --temporary-directory=. -no "$filename"".right" "$filename" ./lib/danker.py "$filename" "$filename"".right" $damping_factor $iterations $start_value | sed "s/\(.*\)/Q\1/" > "$filename".rank rm "$filename"".right" fi -sort -nro "$filename"".rank" --field-separator=$'\t' --key=2 "$filename"".rank" +sort -S 50% -nro "$filename"".rank" --field-separator=$'\t' --key=2 "$filename"".rank" diff --git a/lib/createLinks.sh b/lib/createLinks.sh index 29c2768..74fc22d 100755 --- a/lib/createLinks.sh +++ b/lib/createLinks.sh @@ -22,7 +22,7 @@ rss="https://dumps.wikimedia.org/""$1""wiki/latest/" ###################### DOWNLOAD AND UNZIP wget -q "$rss""$1""wiki-latest-pagelinks.sql.gz-rss.xml" "$rss""$1""wiki-latest-page_props.sql.gz-rss.xml" "$rss""$1""wiki-latest-page.sql.gz-rss.xml" "$rss""$1""wiki-latest-redirect.sql.gz-rss.xml" -dump_date=`cat *.xml | sed -n "s#^ $download\(.*\)#\1#p" | sort -u | head -n 1` +dump_date=`cat *.xml | sed -n "s#^ $download\(.*\)#\1#p" | sort -S 50% -u | head -n 1` rm *.xml pagelinks="$1""wiki-""$dump_date""-pagelinks.sql" pageprops="$1""wiki-""$dump_date""-page_props.sql" @@ -46,15 +46,15 @@ fi rm "$1"*.sql 
###################### JOINS export LC_ALL=C -sort --field-separator=$'\t' --key=2 -o "$1""page.lines" "$1""page.lines" -sort --field-separator=$'\t' --key=2 -o "$1""pagelinks.lines" "$1""pagelinks.lines" +sort -S 50% --field-separator=$'\t' --key=2 -o "$1""page.lines" "$1""page.lines" +sort -S 50% --field-separator=$'\t' --key=2 -o "$1""pagelinks.lines" "$1""pagelinks.lines" join -j 2 "$1""pagelinks.lines" "$1""page.lines" -o 1.1,2.1 -t $'\t' > "$1""pagelinks2.lines" -sort --field-separator=$'\t' --key=2 -o "$1""pagelinks2.lines" "$1""pagelinks2.lines" +sort -S 50% --field-separator=$'\t' --key=2 -o "$1""pagelinks2.lines" "$1""pagelinks2.lines" # take care of redirects (note: 'double redirects' are fixed by bots --> https://en.wikipedia.org/wiki/Wikipedia:Double_redirects) -sort --field-separator=$'\t' --key=2 -o "$1""redirects.lines" "$1""redirects.lines" +sort -S 50% --field-separator=$'\t' --key=2 -o "$1""redirects.lines" "$1""redirects.lines" join -j 2 "$1""redirects.lines" "$1""page.lines" -o 2.1,1.1 -t $'\t' > "$1""redirects2.lines" -sort --field-separator=$'\t' --key=2 -o "$1""redirects2.lines" "$1""redirects2.lines" +sort -S 50% --field-separator=$'\t' --key=2 -o "$1""redirects2.lines" "$1""redirects2.lines" join -j 2 "$1""pagelinks2.lines" "$1""redirects2.lines" -o 1.1,2.1 -t $'\t' > "$1""pagelinks22.lines" # we can write this back to our page links set (potentially duplicating links) @@ -63,11 +63,11 @@ cat "$1""pagelinks22.lines" >> "$1""pagelinks2.lines" # end redirects ###################### GET Q-IDs -sort --field-separator=$'\t' --key=2 -o "$1""pagelinks2.lines" "$1""pagelinks2.lines" -sort --field-separator=$'\t' --key=2 -o "$1""pageprops.lines" "$1""pageprops.lines" +sort -S 50% --field-separator=$'\t' --key=2 -o "$1""pagelinks2.lines" "$1""pagelinks2.lines" +sort -S 50% --field-separator=$'\t' --key=2 -o "$1""pageprops.lines" "$1""pageprops.lines" join -j 2 "$1""pagelinks2.lines" "$1""pageprops.lines" -o 2.1,1.1 -t $'\t' > "$1""pagelinks.lines" 
-sort --field-separator=$'\t' --key=2 -o "$1""pagelinks.lines" "$1""pagelinks.lines" +sort -S 50% --field-separator=$'\t' --key=2 -o "$1""pagelinks.lines" "$1""pagelinks.lines" join -j 2 "$1""pagelinks.lines" "$1""pageprops.lines" -o 2.1,1.1 -t $'\t' | sed "s/\(Q\|q\)\(.*\)\t\(Q\|q\)\(.*\)/\2\t\4/" > "$1""pagelinks2.lines" -sort "$1""pagelinks2.lines" | uniq > "$1"-"$dump_date"".links" +sort -S 50% -u "$1""pagelinks2.lines" > "$1"-"$dump_date"".links" rm "$1"*.lines echo "$1"-"$dump_date"".links"