# Translates data from the old SVM-HMM format of V2.xx to the new
# format of SVM-HMM V3.xx and later
# Author: Thorsten Joachims
# Date: 12.02.2008
#
# Usage: translate_HMM_dataset.pl labelfile datafile newdatafile
#
# Collects the labels occurring in labelfile (i.e. an SVM-hmm file in
# the old format) and assigns an integer tag ID starting with number 1
# to each label that occurs at least once. Then it translates datafile
# (again, an SVM-hmm file in the old format) by replacing the labels
# with the integer tag IDs. It also replaces the qid by just the
# integer value, using the decimal value to sort the tokens. The new
# format assumes that the tokens are in consecutive order.
#
# Input lines have the form
#     <label> qid:<example>.<token> <features...>
# and are written out as
#     <tagID> qid:<example> <features...>
# Lines starting with '#' are treated as comments and skipped.

use strict;
use warnings;

# Extra trailing arguments are tolerated (and ignored), as before.
die("Usage: translate_HMM_dataset.pl labelfile datafile newdatafile\n")
    unless @ARGV >= 3;

my ($labelfile, $datafile, $newdatafile) = @ARGV;

# --- Pass 1: collect the set of labels used in the label file -------------

my %labels;    # label string -> integer tag ID (1 is a placeholder at first)

open(my $label_fh, '<', $labelfile)
    || die("Could not open '$labelfile'!\n");
while (my $line = <$label_fh>) {
    next if $line =~ /^#/;    # skip comment lines

    # Only record a label when the line really starts with one; blank or
    # whitespace-led lines must not pick up a stale capture from an
    # earlier match.
    if ($line =~ /^(\S+)/) {
        $labels{$1} = 1;
    }
}
close($label_fh);

# Assign consecutive IDs in sorted label order so the mapping is stable.
# print (not printf) is used because labels may contain '%'.
print "Using the following mapping from labels to label IDs:\n";
my $id = 0;
for my $label (sort keys %labels) {
    $id++;
    $labels{$label} = $id;
    print "$label\t --> $id\n";
}

# --- Pass 2: read and translate the old-format data file ------------------

print "Reading old data file ...";

my %example;    # example number -> { token number -> translated line }

open(my $data_fh, '<', $datafile)
    || die("Could not open '$datafile'!\n");
while (my $line = <$data_fh>) {
    # chomp (not chop): a final line without a newline must keep its
    # last character.
    chomp $line;
    next if $line =~ /^#/;    # skip comment lines

    # The list assignment yields 0 in scalar context when the match fails.
    my ($label, $exnum, $toknum, $rest) =
        $line =~ /^(\S+)\s+qid:(\d+)\.(\d+)\s+(.*)$/
        or die("Could not parse '$line'!");

    if (!$labels{$label}) {
        # Label unseen in the label file: give it the next free ID.
        print "WARNING: The label '$label' was found in the datafile, but not in the labelfile.\n";
        $labels{$label} = scalar(keys %labels) + 1;
        print "$label\t --> $labels{$label}\n";
    }

    $example{$exnum}{$toknum} = "$labels{$label} qid:$exnum $rest";
}
close($data_fh);
print "done.\n";

# --- Pass 3: write examples, tokens in numeric order within each example --

print "Writing new data file ...";

open(my $out_fh, '>', $newdatafile)
    || die("Could not open '$newdatafile'!\n");
for my $exnum (sort { $a <=> $b } keys %example) {
    my $tokens = $example{$exnum};
    for my $toknum (sort { $a <=> $b } keys %{$tokens}) {
        print {$out_fh} $tokens->{$toknum} . "\n";
    }
}

# Buffered write errors (e.g. disk full) only surface at close.
close($out_fh) || die("Could not write '$newdatafile'!\n");
print "done.\n";