download gen-dict.pl
Language: Perl
License: GPL
Copyright: (C) 2005, Andrew Rodland
LOC: 82
Project Info
WikiOnCD
Server: BerliOS (SVN)
Type: svn
...\w\wikioncd\trunk\wikioncd\
   build-dictionary.pl
   bzr-inline.pm
   bzr.pm
   convert.pl
   count-words.pl
   dictcomp.pl
   dictcomp.pm
   gen-dict.pl
   inline-bzip2.pl
   parser.pl
   retrieve.pl
   server.pl
   w2h.pl

###
# WikiOnCD Conversion Tool
# Part of WikiOnCD
# Copyright (C) 2005, Andrew Rodland <arodland@entermail.net>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

use IO::File;

$::savings_threshold = 256;

sub load_dict {
	open my $in, "<", "wordcounts" or die $!;
	my @words;

	my $n = 0;
	
	while(<$in>) {
		chomp;
		my ($word, $count) = split ' ', $_;
		my $savings = $count * (length($word) - 1) - length($word);
		if ($savings >= $::savings_threshold) {
			push @words, [$word, $savings, $count];
		}
		$n ++;
		unless ($n % 1000) {
			print "\rLoaded $n words.";
		}
	}
	
	print "\rLoaded $n words.\n";
	return @words;
}

sub gen_dict {

	local *list = \@_;	

	open my $dict, ">dictionary" or die $!; 
	my $bs = 0;
	my $cont = 0;

	print STDERR "Kept ", scalar @list, " words.\n";
	print STDERR "Sorting...\n";

	for (1 .. 64) {
		my $max = $list[0][1];
		my $max_ind = 0;

		for (0 .. $#list) {
			if ($list[$_][1] > $max) {
				$max_ind = $_;
				$max = $list[$_][1];
			}
		}

		goto out if $max < $::savings_threshold;

		$bs += $max;
		print $dict $list[$max_ind][0], "\n";	
		splice @list, $max_ind, 1, ();
	}

	print "Phase 2: Filtering... ";
	for (@list) {
		$_->[1] = $_->[2] * (length($_->[0]) - 2) - length($_->[0]);
	}

	@list = grep { $_->[1] >= $::savings_threshold } @list;
	print STDERR scalar @list, " words.\n";

	print STDERR "Sorting...\n";

	for (1 .. 8001) {
		my $max = $list[0][1];
		my $max_ind = 0;

		for (0 .. $#list) {
			if ($list[$_][1] > $max) {
				$max_ind = $_;
				$max = $list[$_][1];
			}
		}

		goto out if $max < $::savings_threshold;

		$bs += $max;
		print $dict $list[$max_ind][0], "\n";	
		splice @list, $max_ind, 1, ();
	}

	for (@list) {
		$_->[1] = $_->[2] * (length($_->[0]) - 3) - length($_->[0]);
	}

	print STDERR "Phase 3: Filtering... ";
	@list = grep { $_->[1] >= $::savings_threshold } @list;
	print STDERR scalar @list, " words.\n";


	if (@list > 2056320) {
		splice @list, 2056320;
	}

	for (@list) {
		print $dict $_->[0], "\n";
		$bs += $_->[1];
	}

out: close $dict;
	 print "$bs possible bytes saved.\n";
}

$|++;

my @list = load_dict();
gen_dict(@list);

About Koders | Resources | Downloads | Support | Black Duck | Submit Project | Terms of Service | DMCA | Privacy Policy | Site Map| Contact Us