Changes

Jump to navigation Jump to search
Created page with "This page provides resources for the PhD Masterclass "How to Build a Web Crawler", which I gave on Friday 28th January 2011 to interested PhD students at Haas. ==Tools== *[h..."
perl Script1.pl
Or we can run it in command Komodo by going:
Debug->Go
Use PuTTY to connect to bear.haas.berkeley.edu (see [[Research Computing At Haas|here]]).
perl Script1.pl
 
===Processing Text Data===
#!/usr/bin/perl -w
#Lines that start with a # are comments that aren't read by the interpreter
use strict;
#The strict module forces us to declare variables before we use them
my @Textfile;
#Declare an array called TextFile
open (DATA,"Data.txt");
#Open a filehandle on our file
while (<DATA>) {
#Read the data from the filehandle, line by line
chomp $_;
#$_ is a special variable - it captures the line being read from the filehandle here
if (!$_) {next;}
#if the line is undefined (i.e. blank) move to the next loop iteration
my $line = $_;
#Set a local variable called line to $_
push (@Textfile, $line);
#Push the line onto the Textfile array
}
my $Doccell;
#Declare the Doccell variable
for (my $i=0; $i<=$#Textfile; $i++) {
#Do a for loop, starting from i=0, going while i is less than the
#last index of the Textfile array, and incrementing by one each time
if ($Textfile[$i]=~/^Document\(s\):/) {$Doccell=$i;}
#Test to see if the entry matches a regular expression, if it does record the index
}
my @docs = splice(@Textfile,$Doccell);
#Create a next array by splicing out everything after the index we just found
shift @docs;
#Remove the first element of the docs array
my $Firm = shift @Textfile;
#Set Firm equal to the first element of Textfile (which we just removed)
my $Violation =shift(@Textfile);
#Set Violation equal to the (new) first element of Textfile (which we just removed)
my $Offense={};
#Create an anonymous hash
foreach my $cell (@Textfile) {\
#Iterative over Textfile, setting the current iteration to cell
my ($name,@value)=split(":",$cell);
#Spill the cell on :
my $value=join(":",@value);
#Join the Value array on :
$Offense->{$name}=$value;
#Set an entry in the Offense hash
}
$Offense->{"DocList"}=\@docs;
#Set the doclist entry in the Offense hash to a reference to the docs array
my $Master=[];
#Define an anonymous array
$Master->[0]={};
#Define an anonymous hash in the zeroth cell of the anonymous array
$Master->[0]->{FirmName}=$Firm;
#Set a hash entry
$Master->[0]->{Offense}=$Offense;
#Set a hash entry
$Master->[0]->{Violation}=$Violation;
#Set a hash entry
open(OUTPUT,">Result.txt");
#Open a filehandle for writing (overwrite the file if it exists)
print OUTPUT $Master->[0]->{FirmName};
#Print the output file an entry from the anonymous hash in the anonymous array
print OUTPUT "\t";
#Print a tab
print OUTPUT $Master->[0]->{Violation}."\t";
#Print another entry with another tab on the end
foreach my $key ( sort {$a cmp $b } (keys %{ $Master->[0]->{Offense} } )) {
#Iterate through the hash's keys, in alphabetical order, setting the current key to $key
print OUTPUT $Master->[0]->{Offense}->{$key}."\t";
#Print an entry, with a tab
}
print OUTPUT "\n";
#Print a new line
close OUTPUT;
#Close the output filehandle - this will flush the write buffer
==Modules==
use LWP::UserAgent;
#Use the LWP::UserAgent modules
my $ua = LWP::UserAgent->new;
#Create a new UserAgent
my $url="http://www.contractormisconduct.org/index.cfm/1,73,222,html?CaseID=2";
#Set up a string containing a URL
my $response = $ua->get($url);
#Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object
my $content=$response->decoded_content;
#Get the response as one long text string, so we can work with it...
use HTML::TreeBuilder;
#Use the HTML::TreeBuilder modules
my $tree = HTML::TreeBuilder->new; # empty tree
#Create a new tree object
$tree->parse($content);
#Load up the tree from the content string (that we got using UA)
my $dump=$tree->as_text;
#Dump the tree as text maybe
my $incidentelement=$tree->look_down("id","primecontent");
#Or use HTML::Element methods to look_down the tree for a tag with some properties
 
==An Example Webcrawler==
 
I wrote the following simple webcrawler for a fellow PhD student:
 
#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
#Use the LWP::UserAgent modules
use HTML::TreeBuilder;
#Use the HTML::TreeBuilder modules
my $ua = LWP::UserAgent->new;
#Create a new UserAgent
my @Pkids;
open (PKIDS,"Pkidfile.txt") || die "Can't open the PKID file to read $!";
#Open the Pkid file to read - this file has a Pkid on each line. You can get some from here: http://myaccount.sdar.com/RealtorSrch.asp
while (<PKIDS>) {
#Read the pkid file line by line
chomp $_;
#Remove the \n (newline symbol) from each line
push(@Pkids,$_);
#Add the PKID to an array
}
open (RESULTS,">Results.txt") || die "Can't write the Results.txt file $!";
#Open the Results file to write
my $headerflag=0;
#Set a flag to indicate whether we wrote the header line to the output file
foreach my $Pkid (sort (@Pkids)) {
#Go through the PKIDs in order
my $url="http://myaccount.sdar.com/RealtorSrchDetail.asp?PKID=".$Pkid;
#Set up a string containing a URL
my $response = $ua->get($url);
#Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object
my $content=$response->decoded_content;
#Get the response as one long text string, so we can work with it...
my $tree = HTML::TreeBuilder->new; # empty tree
#Create a new tree object
$tree->parse($content);
#Load up the tree from the content string (that we got using UA)
my $name=$tree->look_down("width","520");
#Find an element in the HTML that has width=520 (this is where names are stored)
my $nametext=$name->as_text;
#Convert it to text
$nametext=~s/^\s{1,}//;
#Remove leading spaces
$nametext=~s/\s{1,}$//;
#Remove trailing spaces
$nametext=~s/^\s{2,}/ /g;
#Replace double spaces with a single space, globally
my @fieldstext;
#Declare an array
my @fields=$tree->look_down("class","field_labels");
#Find all of the field elements
foreach my $field (@fields) {
#Go through them
my $fieldparent=$field->parent;
#Go to their parent
my $fieldparenttext=$fieldparent->as_text;
#Turn the parent into text
$fieldparenttext=~s/^\s{1,}//; $fieldparenttext=~s/\s{1,}$//; $fieldparenttext=~s/^\s{2,}/ /g;
#Deal with spaces again
push @fieldstext,$fieldparenttext;
#Add the fields to a list
}
&writeoutput($Pkid,$nametext,@fieldstext);
#Call the write output subroutine
$content=undef; $tree=undef; $name=undef; undef @fields;
#Set a bunch of variables to undefined - this frees up memory
sleep(2);
#Pause for a second or two...
}
close (RESULTS);
#Close the Results filehandle - this flushes the write buffer
sub writeoutput {
#Declare the writeoutput subroutine
my $data={};
#Set up an anonymous hash
$data->{"A Pkid"}=shift @_;
#Set the A PKID field to the first parameter passed to the subroutine
$data->{"A Name"}=shift @_;
#Set the A PKID field to the second parameter passed to the subroutine (the first has now gone)
push(my @fields,@_);
#Add the remaining parameters to an array
foreach my $field (@fields) {
#Go through the array
my @fieldparts=split(":",$field);
#Split the fields on semicolon
my $key=shift(@fieldparts);
#Set the key
$data->{$key}=join(":",@fieldparts);
#Write the hash entry
}
if (!$headerflag) {
#If the headflag is 0 then do this
foreach my $key (sort {$a cmp $b} (keys %{$data})) {
#Go through the keys
print RESULTS $key."\t";
#Write the key followed by a tab
}
print RESULTS "\n";
#Print a newline
$headerflag=1;
#Set the headflag to 1
}
foreach my $key (sort {$a cmp $b} (keys %{$data})) {
#Go through the keys again
print RESULTS $data->{$key}."\t";
#This time print the data followed by tabs
}
print RESULTS "\n";
#print a newline
}
print "Thanks to Ed";
#Thank Ed.
 
[[category:McNair Admin]]
[[admin_classification::Software Tutorial| ]]

Navigation menu