[kumapower:research_on/brown_lob/data] marco% pwd
/Users/marco/Desktop/MarcoKuma/research_on/brown_lob/data

We create a (more or less) cleaned-up version of LOB:

[kumapower:research_on/brown_lob/data] marco% perl -ne 'if (/\*\*\[[0-9]+ TEXT (.+)\*\*\]/){print "CURRTEXT_$1\n";next} if ((/\*\*\[/)||(/\*#/)) {next} s/^[^\s]+\s+[^\s]+\s+//g; s/[\|\^\*\{\}]//g;s/\\0//g;s/([A-Za-z\)])[0-9]/$1/g;s/[0-9]([A-Za-z\("])/$1/g;s/^<[0-9]*//;s/>$//; print' /Volumes/ICAME/UNIX/LOBUNTAG/LOB_* > lob.txt
[kumapower:research_on/brown_lob/data] marco% wc lob.txt
99932 1005539 5817523 lob.txt

Same for Brown:

[kumapower:research_on/brown_lob/data] marco% perl -ne 'chomp;$curr=$_; $curr =~ s/^([^\s]+).*/$1/;if($curr ne $prev){print "CURRTEXT_$curr\n";$prev=$curr} s/^[^\s]+\s+[^\s]+\s+(.*)/$1/;print;print "\n"' /Volumes/ICAME/UNIX/BROWN1/BROWN1_* > brown.txt
[kumapower:research_on/brown_lob/data] marco% wc brown.txt
92291 1015440 5998341 brown.txt

Tokenizing, one word per line:

[kumapower:research_on/brown_lob/data] marco% perl -ne 'if (/CURRTEXT/){print;next} s/\s+/\n/g; s/\b\S*[0-9]\S*\b//g; s/[^\na-zA-Z\-\x27]//g; s/(^|\n)[\x27\-]+/\n/g; s/[\x27\-]+(\n|$)/\n/g;print' lob.txt | egrep "[a-zA-Z]" > lob.tok
[kumapower:research_on/brown_lob/data] marco% wc lob.tok
993905 993905 5612875 lob.tok
[kumapower:research_on/brown_lob/data] marco% perl -ne 'if (/CURRTEXT/){print;next} s/\s+/\n/g; s/\b\S*[0-9]\S*\b//g; s/[^\na-zA-Z\-\x27]//g; s/(^|\n)[\x27\-]+/\n/g; s/[\x27\-]+(\n|$)/\n/g;print' brown.txt | egrep "[a-zA-Z]" > brown.tok
[kumapower:research_on/brown_lob/data] marco% wc brown.tok
1005505 1005505 5771442 brown.tok

Token frequencies:

[kumapower:research_on/brown_lob/data] marco% grep -v CURRTEXT lob.tok | sort | uniq -c | gawk '{print $2,$1}' > lob_tok_fqs
[kumapower:research_on/brown_lob/data] marco% wc lob_tok_fqs
53823 107646 598084 lob_tok_fqs
[kumapower:research_on/brown_lob/data] marco% grep -v CURRTEXT brown.tok | sort | uniq -c | gawk '{print $2,$1}' > brown_tok_fqs
[kumapower:research_on/brown_lob/data] marco% wc brown_tok_fqs
55734 111468 624486 brown_tok_fqs

Document frequencies:

[kumapower:research_on/brown_lob/data] marco% ../../lucia_comps/scripts/doc_delimited_uniq.pl "CURRTEXT" lob.tok | grep -v CURRTEXT | sort | uniq -c | gawk '{print $2,$1}' > lob_doc_fqs
[kumapower:research_on/brown_lob/data] marco% wc lob_doc_fqs
53823 107646 595613 lob_doc_fqs
[kumapower:research_on/brown_lob/data] marco% ../../lucia_comps/scripts/doc_delimited_uniq.pl "CURRTEXT" brown.tok | grep -v CURRTEXT | sort | uniq -c | gawk '{print $2,$1}' > brown_doc_fqs
[kumapower:research_on/brown_lob/data] marco% wc brown_doc_fqs
55734 111468 621835 brown_doc_fqs