#!/usr/bin/perl
#
# WordMake 0.1: dictionary maker based on text-file input.
#
# Takes raw text files as input and parses them into a dictionary file.
#
# Features:
#   - Select as many raw text files as you like.
#   - Choose words whose length lies within a specified range.
#   - Choose an output file.
#   - Lowercases all words, removes duplicates and sorts them.
#
# That's all I was looking for in order to make a better Spanish dictionary.
# If you find it useful, drop me a line, and if you want, send me your
# dictionary.
#
# TODO: Better command-line parsing, more regular-expression filters, etc.
# Of course, if you make it better, let me know.
#
# Author: Linga
# Date:   25-05-00
#
# Usage: ./wordmake.pl TEXT_FILE_1 TEXT_FILE_2 ... TEXT_FILE_N
#            [-o OUTPUT_FILE] [-min MIN_WORD_LENGTH] [-max MAX_WORD_LENGTH]
#
# Defaults (may be changed on the command line):
#   -min 4, -max 8, -o dict_clean.txt

use strict;
use warnings;

# Return the one-line usage string shown alongside argument errors.
sub usage {
    return "Usage: $0 TEXT_FILE_1 ... TEXT_FILE_N"
         . " [-o OUTPUT_FILE] [-min MIN_WORD_LENGTH] [-max MAX_WORD_LENGTH]\n";
}

# Parse the command-line arguments.
#
# Parameters: the argument list (normally @ARGV).
# Returns:    a hash reference with the keys
#               min   - minimum word length (default 4)
#               max   - maximum word length (default 8)
#               out   - output file name    (default "dict_clean.txt")
#               files - array reference of input file names, in order
# Dies:       when an option is missing its value, or when the lengths are
#             not positive integers with min <= max.
sub parse_args {
    my @args = @_;

    my %opt = (
        min   => 4,
        max   => 8,
        out   => 'dict_clean.txt',
        files => [],
    );
    my %key_for = ( '-o' => 'out', '-min' => 'min', '-max' => 'max' );

    # Iterate on the number of remaining arguments rather than on their truth
    # value, so that an argument such as "0" does not end parsing early.
    while (@args) {
        my $arg = shift @args;
        if ( exists $key_for{$arg} ) {
            die "Option $arg requires a value\n" . usage() unless @args;
            $opt{ $key_for{$arg} } = shift @args;
        }
        else {
            push @{ $opt{files} }, $arg;
        }
    }

    for my $name (qw(min max)) {
        die "-$name must be a positive integer, got '$opt{$name}'\n" . usage()
            unless $opt{$name} =~ /\A[0-9]+\z/ && $opt{$name} >= 1;
    }
    die "-min ($opt{min}) must not be greater than -max ($opt{max})\n" . usage()
        if $opt{min} > $opt{max};

    return \%opt;
}

# Extract the dictionary words contained in one chunk of text.
#
# Parameters: $text - the text to scan
#             $min  - minimum accepted word length
#             $max  - maximum accepted word length
# Returns:    the list of lowercased words (maximal runs of \w characters)
#             whose length is between $min and $max inclusive, in order of
#             appearance; duplicates are NOT removed here.
#
# Whole words are filtered by length, so a word longer than $max is dropped
# instead of being truncated to a $max-character prefix.
#
# NOTE(review): the input is handled as raw bytes, so lc and \w presumably
# only cover ASCII letters; accented characters would split words. Confirm
# the intended input encoding before adding an I/O layer.
sub extract_words {
    my ( $text, $min, $max ) = @_;
    return grep { length($_) >= $min && length($_) <= $max }
           ( lc($text) =~ /\w+/g );
}

# Build the dictionary: read every input file, collect the unique words and
# write them, sorted, one per line, to the output file.
#
# Parameters: the command-line argument list.
# Dies:       on invalid arguments or on any file open/close failure.
sub main {
    my $opt = parse_args(@_);

    my %seen;
    for my $file ( @{ $opt->{files} } ) {
        open my $raw_fh, '<', $file
            or die "Couldn't open input file $file: $!\n";
        while ( my $line = <$raw_fh> ) {
            $seen{$_} = 1 for extract_words( $line, $opt->{min}, $opt->{max} );
        }
        close $raw_fh
            or die "Couldn't close input file $file: $!\n";
    }

    # The output is opened only after all inputs were read successfully, so
    # a bad input file name does not truncate an existing dictionary.
    open my $dict_fh, '>', $opt->{out}
        or die "Can't open output file $opt->{out}: $!\n";
    print {$dict_fh} "$_\n" for sort keys %seen;
    close $dict_fh
        or die "Can't close output file $opt->{out}: $!\n";

    return;
}

# Run only when executed as a script, not when loaded from another file.
main(@ARGV) unless caller;

1;