download webscpaper_filter.pl
Language: Perl
LOC: 42
Project Info
test-akihito-project - This is Test Project(test-akihito-project)
Server: Google
Type: svn
...ito‑project\trunk\perl\bot\
   author.pl
   flickr.pl
   ikea-xpath.pl
   ikea.pl
   japanese_kotowaza.pl
   nirayama.pl
   nitori.pl
   webscpaper_filter.pl

#!/usr/bin/perl

# Fetch an RSS feed, scrape every page the feed entries link to, and print
# the collected title / link / description terms as YAML on STDOUT.
#
# NOTE(review): there is intentionally no `use utf8` here.  The two regex
# literals below are matched by Web::Scraper::Filter::Term against
# Text::MeCab node strings *before* that filter decodes them, so the
# patterns presumably have to stay byte patterns -- confirm before changing.

use strict;
use warnings;

use LWP::UserAgent;
use Web::Scraper;
use URI;
use YAML;
use XML::Feed;
use URI::Fetch;

# Shared with Web::Scraper::Filter::Term (defined later in this file), which
# closes over these file-scoped lexicals; keep their names and scope.
# $term_type selects nodes whose MeCab feature string starts with
# "noun, general"; $unterm rejects surfaces containing "world", "work"
# or "material".
my $term_type = qr/^名詞,一般/;
my $unterm    = qr/世界|作品|資料/;

my $url = 'http://example.com/hoge/foo.rss';

# The user agent has to be created explicitly; calling get() on an
# undeclared $ua dies with "Can't call method on an undefined value".
my $ua       = LWP::UserAgent->new;
my $response = $ua->get($url);
die "GET $url failed: " . $response->status_line . "\n"
    unless $response->is_success;

# XML::Feed is handed the raw response body by reference.
my $content = $response->content;

my $feed = XML::Feed->parse(\$content) or die XML::Feed->errstr;

# One scrape result per feed entry, in feed order.
my $scraped = [];
for my $entry ($feed->entries) {
    my $entry_url = $entry->link;

    # An entry without a link gives us nothing to fetch.
    unless (defined $entry_url && length $entry_url) {
        warn "skipping feed entry without a link\n";
        next;
    }

    # Pause between page requests.
    sleep(2);

    push @$scraped, scraper {
        # Title text from the heading link; 'link' is set to the entry URL
        # through the callback whenever that node is found.
        process '//div[@id="incontents"]/h1[@class="ch04"]/span/a',
            'title' => 'TEXT',
            'link'  => sub { $entry_url };

        # Every paragraph except the last one, each run through the 'Term'
        # filter (Web::Scraper::Filter::Term).
        process '//div[@id="contents"]/div[@id="incontents"]/p[position() < last() ]',
            'description[]' => ['TEXT', 'Term'];

        result qw/title link description/;
    }->scrape(URI->new($entry_url));
}

# The dump contains character data (the Term filter returns decoded
# strings), so give STDOUT an explicit encoding layer instead of relying on
# perl's "Wide character in print" fallback.
binmode STDOUT, ':encoding(UTF-8)';
print YAML::Dump($scraped);


package Web::Scraper::Filter::Term;

use strict;
use base qw(Web::Scraper::Filter);
use Text::MeCab;
use Encode;

# Web::Scraper filter that reduces a piece of text to a list of terms.
#
# The text is parsed with Text::MeCab and the resulting node chain is
# walked.  A node's surface is kept when the node's feature string matches
# $term_type and the surface does not match $unterm; both patterns are
# file-scoped lexicals declared earlier in this file.
#
# Arguments: $self  - the filter object (unused)
#            $value - the text to analyse
# Returns:   an array reference of the kept surfaces, in parse order.
sub filter {
    my ($self, $value) = @_;

    my $tagger = Text::MeCab->new();
    my @terms;

    my $node = $tagger->parse($value);
    while ($node) {
        next unless $node->feature =~ $term_type;

        my $surface = $node->surface;
        next if $surface =~ $unterm;

        # Decode in place only when perl does not already treat the
        # surface as a character string.
        utf8::decode($surface) unless utf8::is_utf8($surface);
        push @terms, $surface;
    }
    continue {
        # Runs after every iteration, including those cut short by `next`.
        $node = $node->next;
    }

    return \@terms;
}


__END__

About Koders | Resources | Downloads | Support | Black Duck | Submit Project | Terms of Service | DMCA | Privacy Policy | Site Map | Contact Us