BlogDump.pm


stáhnout snippet
zobrazení: SyntaxHighlighter | GeSHi | Holý text
#!/usr/bin/perl

package BlogDump;

use strict;
use warnings;

use Text::Unaccent;
use Data::Dumper;    # debug
use WWW::Mechanize;
use Article;

use constant FEED	=> '/feed/rss';

#
# konstruktor
#
sub new {

	my $self = {
		url      => undef,
		_bot     => undef
	};
	bless $self, 'BlogDump';

	$self->{_bot} = $self->newBot();

	return $self;
}

sub dump {
	my ($self, $url) = @_;

	$self->{url} = $url;
	my $b = $self->{_bot};

	$b->get($url.FEED);

	my $c = $b->content;
	my @r;
	while($c =~ /(http:\/\/[^\/<]+\/[^<]+)/g) {
		print "- ".$1."\n";
		push(@r, $self->dumpArticle($1));
	}
	return @r;
}

sub dumpArticle {
	my ($self, $url) = @_;

	my $b = $self->{_bot};
	$b->get($url);

	my $c = $b->content;

	$a = Article->new();
	$a->{url} = $url;

	$c =~ /<h1>([^<]+)/s;
	$a->{title} = $1;
	print "   - $1\n";

	$c =~ /<div class="article">\s*<div class="top">(.*?)<\/div>(.*?)<\/div>/s;

	my $top     = $1;
	$a->{content} = $2;

	if($top =~ />([^<]+)<\/a>/s) {
		$a->{category} = $1;

		$a->{category} =~ s/>//g;
		$a->{category} =~ s/<//g;
		$a->{category} =~ s/&/&/g;
	}

	$a->{title} =~ s/&/&/g;
	$a->{title} =~ s/>//g;
	$a->{title} =~ s/<//g;

	return $a;
}

#
# Vytvori noveho robota
#
sub newBot {
	my ($self) = @_;

	my $bot = WWW::Mechanize->new();

	$bot->_reset_page;
	$bot->cookie_jar( HTTP::Cookies->new( file => "cookies.dat" ) );
	$bot->agent_alias('Windows IE 6');

	return $bot;
}

1;


Tagy:
perl 98 řádků | 2008-05-29 22:57:37 | air.kadlec@seznam.cz