Changes

Jump to navigation Jump to search
Created page with "This page provides resources for the PhD Masterclass "How to Build a Web Crawler", which I gave on Friday 28th January 2011 to interested PhD students at Haas. ==Tools== *[h..."
perl Script1.pl
Or we can run it in command Komodo by going:
Debug->Go
Use PuTTY to connect to bear.haas.berkeley.edu (see [[Research Computing At Haas|here]]).
perl Script1.pl
 
===Processing Text Data===
my $incidentelement=$tree->look_down("id","primecontent");
#Or use HTML::Element methods to look_down the tree for a tag with some properties
 
==An Example Webcrawler==
 
I wrote the following simple webcrawler for a fellow PhD student:
 
#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
#Use the LWP::UserAgent modules
use HTML::TreeBuilder;
#Use the HTML::TreeBuilder modules
my $ua = LWP::UserAgent->new;
#Create a new UserAgent
my @Pkids;
open (PKIDS,"Pkidfile.txt") || die "Can't open the PKID file to read $!";
#Open the Pkid file to read - this file has a Pkid on each line. You can get some from here: http://myaccount.sdar.com/RealtorSrch.asp
while (<PKIDS>) {
#Read the pkid file line by line
chomp $_;
#Remove the \n (newline symbol) from each line
push(@Pkids,$_);
#Add the PKID to an array
}
open (RESULTS,">Results.txt") || die "Can't write the Results.txt file $!";
#Open the Results file to write
my $headerflag=0;
#Set a flag to indicate whether we wrote the header line to the output file
foreach my $Pkid (sort (@Pkids)) {
#Go through the PKIDs in order
my $url="http://myaccount.sdar.com/RealtorSrchDetail.asp?PKID=".$Pkid;
#Set up a string containing a URL
my $response = $ua->get($url);
#Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object
my $content=$response->decoded_content;
#Get the response as one long text string, so we can work with it...
my $tree = HTML::TreeBuilder->new; # empty tree
#Create a new tree object
$tree->parse($content);
#Load up the tree from the content string (that we got using UA)
my $name=$tree->look_down("width","520");
#Find an element in the HTML that has width=520 (this is where names are stored)
my $nametext=$name->as_text;
#Convert it to text
$nametext=~s/^\s{1,}//;
#Remove leading spaces
$nametext=~s/\s{1,}$//;
#Remove trailing spaces
$nametext=~s/^\s{2,}/ /g;
#Replace double spaces with a single space, globally
my @fieldstext;
#Declare an array
my @fields=$tree->look_down("class","field_labels");
#Find all of the field elements
foreach my $field (@fields) {
#Go through them
my $fieldparent=$field->parent;
#Go to their parent
my $fieldparenttext=$fieldparent->as_text;
#Turn the parent into text
$fieldparenttext=~s/^\s{1,}//; $fieldparenttext=~s/\s{1,}$//; $fieldparenttext=~s/^\s{2,}/ /g;
#Deal with spaces again
push @fieldstext,$fieldparenttext;
#Add the fields to a list
}
&writeoutput($Pkid,$nametext,@fieldstext);
#Call the write output subroutine
$content=undef; $tree=undef; $name=undef; undef @fields;
#Set a bunch of variables to undefined - this frees up memory
sleep(2);
#Pause for a second or two...
}
close (RESULTS);
#Close the Results filehandle - this flushes the write buffer
sub writeoutput {
#Declare the writeoutput subroutine
my $data={};
#Set up an anonymous hash
$data->{"A Pkid"}=shift @_;
#Set the A PKID field to the first parameter passed to the subroutine
$data->{"A Name"}=shift @_;
#Set the A PKID field to the second parameter passed to the subroutine (the first has now gone)
push(my @fields,@_);
#Add the remaining parameters to an array
foreach my $field (@fields) {
#Go through the array
my @fieldparts=split(":",$field);
#Split the fields on semicolon
my $key=shift(@fieldparts);
#Set the key
$data->{$key}=join(":",@fieldparts);
#Write the hash entry
}
if (!$headerflag) {
#If the headflag is 0 then do this
foreach my $key (sort {$a cmp $b} (keys %{$data})) {
#Go through the keys
print RESULTS $key."\t";
#Write the key followed by a tab
}
print RESULTS "\n";
#Print a newline
$headerflag=1;
#Set the headflag to 1
}
foreach my $key (sort {$a cmp $b} (keys %{$data})) {
#Go through the keys again
print RESULTS $data->{$key}."\t";
#This time print the data followed by tabs
}
print RESULTS "\n";
#print a newline
}
print "Thanks to Ed";
#Thank Ed.
 
[[category:McNair Admin]]
[[admin_classification::Software Tutorial| ]]

Navigation menu