User:Rab/Beowulf commands


#! /bin/sh
# create a tarlist from wikipedia tar archive
# 1/20/08
# assumes a named pipe called via on mist

# Stream the Wikipedia tar archive out of HDFS and write tar's verbose member
# listing to `via` (the file header says `via` is expected to be a named pipe).
# Original note: very slow bottleneck?
hadoop fs -cat /data/wiki/static-tar/wikipedia-en-html.tar \
  | tar -tvf - \
  > via


# in another window, as root:
# hadoop fs -put via /data/wiki/static-tar/wikipedia-en-html.tarlist


# dump to stdout, one file per line:
# tar xf trytar.tar --to-command 'tr "\012" " " ; echo ""'


# hadoop fs -cat /data/wiki/static-tar/wikipedia-en-html.tarlist | egrep  -v '~' | awk 'NR%10000 == 2 {print $NF; count++} count==100 {exit}' > sample.100


# hadoop fs -cat /data/wiki/static-tar/wikipedia-en-html.tar | tar xf - -T sample.100


# awk '{print NR, $0}' names_* | join -j 1 names.pre* - | sed -e 's/[^ ]* //' -e 's/ //' -e 's/ /_/g' > names.lis


# cut -c 1-3 names_* | tr A-Z a-z | sed 's,\(.\)\(.\)\(.\),en/articles/\1/\2/\3/,' | awk '{print NR, $0}' > names.prefix.n

# Build sample.1000 from names.lis and the tar listing(s).
#
# These four commands had been pasted from shell history onto a single line
# with the history numbers (35, 37, 38) still attached, which made the line
# unrunnable as shell. They are separated here, one command per step.

# Step 1: append ".html" to every line of names.lis and tag it with "n".
awk '{print $0 ".html", "n"}' names.lis > names.lis_n

# Step 2: take the last field of every tar-listing line and tag it with "t".
awk '{print $NF, "t"}' *.tarlist > tarlist_t

# Step 3: sort both tagged lists together. From the sorted stream keep
#   - every line whose second field is "n", provided some line follows it, and
#   - any other line that comes directly after such an "n" line.
# An "n" line that is the very last line of the stream is never flushed.
sort tarlist_t names.lis_n \
  | awk '
      $2 == "n" && saved != "" { print saved; saved = $0; next }
      $2 == "n"                { saved = $0; next }
      saved != ""              { print saved; print; saved = "" }
    ' > pairs.out

# Step 4: print the first field of each non-"n" line that directly follows an
# "n" line in pairs.out. (The original had two "n" rules with identical
# actions; they are merged into one.)
awk '
  $2 == "n"   { saved = $1; next }
  saved != "" { print $1; saved = ""; next }
' pairs.out > sample.1000
# find page names that may differ only by an added suffix:

# Process the archive members listed in sample.1000: instead of writing them
# to disk, tar hands each member to the --to-command pipeline, which replaces
# newlines (octal 012) with spaces and then emits one newline. The combined
# output goes to wp-1000.dat.
# NOTE(review): the glob is assumed to match exactly one archive; any extra
# matches would be treated by tar as member names.
tar -xzf *.tar.gz \
  -T sample.1000 \
  --to-command 'tr "\012" " " ; echo ""' \
  > wp-1000.dat

# Count the lines in the compressed data file without unpacking it to disk.
gzip -dc wp-all.dat.gz | wc -l

# These two pipelines had been fused onto one line, which made gzip receive
# "gunzip -c wp-all.dat.gz" as file arguments and piped its output into
# csplit. They are separated here so each runs on its own.

# Decompress the original data, filter it through the local ./insert-name
# program (given sample.all and "-" as arguments), and recompress the result.
# NOTE(review): ./insert-name is not shown here; presumably "-" means stdin.
gunzip -c wp-all.dat_orig.gz \
  | ./insert-name sample.all - \
  | gzip > wp-all.dat.gz

# Split the decompressed stream with csplit at line 120000000000, repeating
# the pattern as often as possible ({*}); -k keeps pieces already written if
# csplit stops with an error.
# NOTE(review): that line number looks far larger than any plausible line
# count -- confirm whether a different split size was intended.
gunzip -c wp-all.dat.gz \
  | csplit -k - 120000000000 '{*}'

# Copy the leading lines of the compressed data into wp-100000.dat, stopping
# as soon as awk reaches line 100000.
# NOTE(review): the exit rule fires before line 100000 is printed, so the
# output holds lines 1..99999 -- one fewer than the file name suggests.
gunzip -c wp-all.dat.gz \
  | awk 'NR >= 100000 { exit } 1' \
  > wp-100000.dat

# Take a thin sample of the compressed data: lines 2, 10002, 20002, ... are
# written to wp-100a.dat, and awk stops once 100 lines have been taken.
gunzip -c wp-all.dat.gz \
  | awk '
      NR % 10000 == 2 { print; taken++ }
      taken == 100    { exit }
    ' \
  > wp-100a.dat

# Split the uncompressed wp-all.dat at line 1000000. Output pieces are named
# with the prefix "wp-all.dat", and already-written pieces are kept if csplit
# fails part-way.
csplit --keep-files --prefix=wp-all.dat wp-all.dat 1000000

h01.helios.public.stolaf.edu$ time scp mist.public.stolaf.edu:\~rab/lab10/wikipedia-en-html.tar . ; time hadoop fs -put wikipedia-en-html.tar /data/wiki
[root@h01 rab]# time cat wp-all.dat | ssh mist.public.stolaf.edu ssh class01.mist.public.stolaf.edu cat \\\> /data/rab/wp-all.dat