download convert.pl
Language: Perl
License: GPL
Copyright: (C) 2005, Andrew Rodland
LOC: 186
Project Info
WikiOnCD
Server: BerliOS (SVN)
Type: svn
...\w\wikioncd\trunk\wikioncd\
   build-dictionary.pl
   bzr-inline.pm
   bzr.pm
   convert.pl
   count-words.pl
   dictcomp.pl
   dictcomp.pm
   gen-dict.pl
   inline-bzip2.pl
   parser.pl
   retrieve.pl
   server.pl
   w2h.pl

###
# WikiOnCD Conversion Tool
# Part of WikiOnCD
# Copyright (C) 2005, Andrew Rodland <arodland@entermail.net>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

use IO::File;
use DB_File;

require 'bzr-inline.pm';

$::blocksize = 256;
$::debug = 0;
$::lang = "en";


sub gen_filename {
	my $prefix = lc substr $_[0], 0, 2;
	$prefix =~ s/[^A-Za-z0-9_]/_/g;
	$prefix .= $prefix if length($prefix) < 2;
	$prefix =~ s/^[0-9]/#/;

	my $first = substr $prefix, 0, 1;

	return ($first, $prefix);
}


sub canonicalize {
	my ($filename, $namespace) = @_;
	$namespace = lc $namespace;
	$namespace .= ":" if $namespace;

	$filename = ucfirst $filename;
	$filename =~ s/[^A-Za-z0-9,.'()\x80-\xff-]/_/g;

	return $namespace . $filename;
}


%namespaces = (
		0 => "",
		4 => "wikipedia",
		10 => "template",
		14 => "category",
		);

sub init_index {

	my $filename = shift;
	open my $fh, $filename or die $!;

	my $n = 0;

	while (!eof($fh)) {
		my ($namespace, $title, $text, undef, $is_redirect) = get_next_record($fh);
		next unless defined($namespace);

		next unless $namespace == 0 || $namespace == 4 || $namespace == 10 ||
			$namespace == 14;

		my $ns = $namespaces{$namespace};

		my $key = canonicalize($title, $ns);

		if ($is_redirect) {
			if ($text =~ /^\#REDIRECT \[\[([^\]]+)\]\]/i) {
				my $target = $1;
				my $targns, $targkey;
			
				if ($target =~ /:/) {
					($targns, $target) = split /:/, $target, 2;
				}
				
				if ($key eq canonicalize($target, $targns)) {
					print STDERR "Wtf? Circular redirect. key=$key ns=$ns\n" if $::debug;
				} else {
					print STDERR $key, " => ", $target, "\n" if $::debug;
					write_redirect($title, $ns, $target, $targns) if $target;
				}
			}
		} else {
			print STDERR $key, "\n" if $::debug;
			write_data($title, $ns, \$text);
		}

		$n ++;
		if ($n % 1000 == 0) {
			print "\r$n ";
		}
	}

	print "\r$n\n";
}

sub write_data {
	my ($title, $ns, $text) = @_;

	RemoveHTMLcomments($text);
#	rewrite_links($text);	

	my ($onechar, $prefix) = gen_filename($title);

	if (!$::did_dir{$onechar}) {
		mkdir "out/$onechar" unless -d "out/$onechar";
		$::did_dir{$onechar} ++;
	}

	if (!defined $::bzr{$prefix}) {
		$::bzr{$prefix} = Compress::Bzip2::RandomAccess->new_to_file(
				"out/$onechar/$prefix.bzr", $::blocksize) or die $!; 
	}

	$::bzr{$prefix}->write_file(canonicalize($title, $ns), $$text) 
}

sub RemoveHTMLcomments {
	my $text = shift;
	my ($comment_start, $comment_end);
	
	$comment_start = "<!--";
	$comment_end = "-->";
	
	$$text =~ s/\Q$comment_start\E.*?\Q$comment_end\E/ /msgo;
}

sub rewrite_links {
	
	my $text = shift;

	$$text =~ s/\[\[([^]]+)]/"[[" . rewrite_link($1) . "]"/ego;
	$$text =~ s/(?<!{){{([^{][^}|\s]+)/"{{" . rewrite_template($1)/ego;
}

sub rewrite_template {
	my $link = shift;
	my $namespace = "t";

	if ($link =~ /^:/) {
		$link =~ s/^://;
	}
	
	if ($link =~ /:/) {
		($namespace, $link) = split ':', $link, 2;

		if ($namespace ne $::lang) {
			if ($::namespace_reverse{lc $namespace}) {
				return $::namespaces{$::namespace_reverse{lc $namespace}} .
					":$link";
			} else {
				return "$namespace:$link";
			}
		}
	} else {
		return $link;
	}
}
	
sub rewrite_link {
	my $link = shift;
	my $namespace = "";

	if ($link =~ /^:/) {
		$link =~ s/^://;
	}
	
	if ($link =~ /:/) {
		($namespace, $link) = split ':', $link, 2;

		if ($namespace ne $::lang) {
			if ($::namespace_reverse{lc $namespace}) {
				$namespace = $::namespaces{$::namespace_reverse{lc $namespace}};
			} else {
				return "$namespace:$link";
			}
		}
	}

	my ($target, $desc) = split /\|/, $link;
	
	my ($page, $anchor) = split /#/, $target;

	if (title_to_key($page, $namespace) ne title_to_web($page)) {
		$ret = title_to_key($page, $namespace);
		if (!$desc) {
			$desc = "|$page";
		}
	} else {
		$ret = $page;
	}

	$ret = "$namespace:$ret" if $namespace;
	$ret .= "#$anchor" if $anchor;
	$ret .= "|$desc" if $desc;

	return $ret;

}

sub write_redirect {
	my ($title, $ns, $target, $targns) = @_;

	my ($prefix, undef) = gen_filename($title);

	if (!$::did_dir{$prefix}) {
		mkdir "out/$prefix" unless -d "out/$prefix";
		$::did_dir{$prefix} ++;
	}

	if (!defined $::redirect{$prefix}) {
		unlink "out/$prefix/redirect";
		tie %{$::redirect{$prefix}}, "DB_File", "out/$prefix/redirect",
			O_RDWR|O_CREAT, 0666 or die "$! opening out/$prefix/redirect";
	}

	my $key = canonicalize($title, $ns);
	$::redirect{$prefix}{$key} = "$targns\0$target";
}

sub get_next_record {
	my $fh = shift;

	while (1) {
		if ($buf =~ m/\(\d+,(\d+),'(.*?)','(.*?)','.*?',\d+,'(.*?)','.*?','.*?',\d+,(\d+).+?\)[,;]/g) {

			my ($ns, $title, $text, $ts, $redir) = ($1, $2, $3, $4, $5);

			%unescape = (
					"'" => "'", '"' => '"', "_" => "_", "%" => "%", "\\" => "\\",
					"n" => "\n", "t" => "\t", "r" => "");

			$title =~ s/\\(.)/$unescape{$1}/eg;
			$text =~ s/\\(.)/$unescape{$1}/eg;

# namespace, title, text, timestamp, is_redirect
			return ($ns, $title, $text, $ts, $redir);
		} else {
			if (! defined($buf = <$fh>)) {
				return (undef);
			}
		}
	}
}


$|++;
my $out = select(STDERR);
$|++;
select($out);

my $filename = $ARGV[0];

mkdir("out");

init_index($filename);

print "Total FH: ", keys(%::redirect) + keys(%::bzr), "\n";

print "Flushing...";
untie %{$::redirect{$_}} for keys %::redirect;
$::bzr{$_}->close_for_write() for keys %::bzr;
print "\n";

About Koders | Resources | Downloads | Support | Black Duck | Submit Project | Terms of Service | DMCA | Privacy Policy | Site Map| Contact Us