Changes

4,567 bytes added , 13:00, 18 July 2016

Created page with "This page provides resources for the PhD Masterclass "How to Build a Web Crawler", which I gave on Friday 28th January 2011 to interested PhD students at Haas. ==Tools== *[h..."

perl Script1.pl

Or we can run it in ~~command~~ Komodo by going:

Debug->Go

Use PuTTY to connect to bear.haas.berkeley.edu (see [[Research Computing At Haas|here]]).

perl Script1.pl

===Processing Text Data===

#!/usr/bin/perl -w

#Lines that start with a # are comments that aren't read by the interpreter

use strict;

#The strict module forces us to declare variables before we use them

my @Textfile;

#Declare an array called TextFile

open (DATA,"Data.txt");

#Open a filehandle on our file

while (<DATA>) {

#Read the data from the filehandle, line by line

chomp $_;

#$_ is a special variable - it captures the line being read from the filehandle here

if (!$_) {next;}

#if the line is undefined (i.e. blank) move to the next loop iteration

my $line = $_;

#Set a local variable called line to $_

push (@Textfile, $line);

#Push the line onto the Textfile array

}

my $Doccell;

#Declare the Doccell variable

for (my $i=0; $i<=$#Textfile; $i++) {

#Do a for loop, starting from i=0, going while i is less than the

#last index of the Textfile array, and incrementing by one each time

if ($Textfile[$i]=~/^Document$s$:/) {$Doccell=$i;}

#Test to see if the entry matches a regular expression, if it does record the index

}

my @docs = splice(@Textfile,$Doccell);

#Create a next array by splicing out everything after the index we just found

shift @docs;

#Remove the first element of the docs array

my $Firm = shift @Textfile;

#Set Firm equal to the first element of Textfile (which we just removed)

my $Violation =shift(@Textfile);

#Set Violation equal to the (new) first element of Textfile (which we just removed)

my $Offense={};

#Create an anonymous hash

foreach my $cell (@Textfile) {\

#Iterative over Textfile, setting the current iteration to cell

my ($name,@value)=split(":",$cell);

#Spill the cell on :

my $value=join(":",@value);

#Join the Value array on :

$Offense->{$name}=$value;

#Set an entry in the Offense hash

}

$Offense->{"DocList"}=\@docs;

#Set the doclist entry in the Offense hash to a reference to the docs array

my $Master=[];

#Define an anonymous array

$Master->[0]={};

#Define an anonymous hash in the zeroth cell of the anonymous array

$Master->[0]->{FirmName}=$Firm;

#Set a hash entry

$Master->[0]->{Offense}=$Offense;

#Set a hash entry

$Master->[0]->{Violation}=$Violation;

#Set a hash entry

open(OUTPUT,">Result.txt");

#Open a filehandle for writing (overwrite the file if it exists)

print OUTPUT $Master->[0]->{FirmName};

#Print the output file an entry from the anonymous hash in the anonymous array

print OUTPUT "\t";

#Print a tab

print OUTPUT $Master->[0]->{Violation}."\t";

#Print another entry with another tab on the end

foreach my $key ( sort {$a cmp $b } (keys %{ $Master->[0]->{Offense} } )) {

#Iterate through the hash's keys, in alphabetical order, setting the current key to $key

print OUTPUT $Master->[0]->{Offense}->{$key}."\t";

#Print an entry, with a tab

}

print OUTPUT "\n";

#Print a new line

close OUTPUT;

#Close the output filehandle - this will flush the write buffer

==Modules==

use LWP::UserAgent;

#Use the LWP::UserAgent modules

my $ua = LWP::UserAgent->new;

#Create a new UserAgent

my $url="http://www.contractormisconduct.org/index.cfm/1,73,222,html?CaseID=2";

#Set up a string containing a URL

my $response = $ua->get($url);

#Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object

my $content=$response->decoded_content;

#Get the response as one long text string, so we can work with it...

use HTML::TreeBuilder;

#Use the HTML::TreeBuilder modules

my $tree = HTML::TreeBuilder->new; # empty tree

#Create a new tree object

$tree->parse($content);

#Load up the tree from the content string (that we got using UA)

my $dump=$tree->as_text;

#Dump the tree as text maybe

my $incidentelement=$tree->look_down("id","primecontent");

#Or use HTML::Element methods to look_down the tree for a tag with some properties

==An Example Webcrawler==

I wrote the following simple webcrawler for a fellow PhD student:

#!/usr/bin/perl -w

use strict;

use LWP::UserAgent;

#Use the LWP::UserAgent modules

use HTML::TreeBuilder;

#Use the HTML::TreeBuilder modules

my $ua = LWP::UserAgent->new;

#Create a new UserAgent

my @Pkids;

open (PKIDS,"Pkidfile.txt") || die "Can't open the PKID file to read $!";

#Open the Pkid file to read - this file has a Pkid on each line. You can get some from here: http://myaccount.sdar.com/RealtorSrch.asp

while (<PKIDS>) {

#Read the pkid file line by line

chomp $_;

#Remove the \n (newline symbol) from each line

push(@Pkids,$_);

#Add the PKID to an array

}

open (RESULTS,">Results.txt") || die "Can't write the Results.txt file $!";

#Open the Results file to write

my $headerflag=0;

#Set a flag to indicate whether we wrote the header line to the output file

foreach my $Pkid (sort (@Pkids)) {

#Go through the PKIDs in order

my $url="http://myaccount.sdar.com/RealtorSrchDetail.asp?PKID=".$Pkid;

#Set up a string containing a URL

my $response = $ua->get($url);

#Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object

my $content=$response->decoded_content;

#Get the response as one long text string, so we can work with it...

my $tree = HTML::TreeBuilder->new; # empty tree

#Create a new tree object

$tree->parse($content);

#Load up the tree from the content string (that we got using UA)

my $name=$tree->look_down("width","520");

#Find an element in the HTML that has width=520 (this is where names are stored)

my $nametext=$name->as_text;

#Convert it to text

$nametext=~s/^\s{1,}//;

#Remove leading spaces

$nametext=~s/\s{1,}$//;

#Remove trailing spaces

$nametext=~s/^\s{2,}/ /g;

#Replace double spaces with a single space, globally

my @fieldstext;

#Declare an array

my @fields=$tree->look_down("class","field_labels");

#Find all of the field elements

foreach my $field (@fields) {

#Go through them

my $fieldparent=$field->parent;

#Go to their parent

my $fieldparenttext=$fieldparent->as_text;

#Turn the parent into text

$fieldparenttext=~s/^\s{1,}//; $fieldparenttext=~s/\s{1,}$//; $fieldparenttext=~s/^\s{2,}/ /g;

#Deal with spaces again

push @fieldstext,$fieldparenttext;

#Add the fields to a list

}

&writeoutput($Pkid,$nametext,@fieldstext);

#Call the write output subroutine

$content=undef; $tree=undef; $name=undef; undef @fields;

#Set a bunch of variables to undefined - this frees up memory

sleep(2);

#Pause for a second or two...

}

close (RESULTS);

#Close the Results filehandle - this flushes the write buffer

sub writeoutput {

#Declare the writeoutput subroutine

my $data={};

#Set up an anonymous hash

$data->{"A Pkid"}=shift @_;

#Set the A PKID field to the first parameter passed to the subroutine

$data->{"A Name"}=shift @_;

#Set the A PKID field to the second parameter passed to the subroutine (the first has now gone)

push(my @fields,@_);

#Add the remaining parameters to an array

foreach my $field (@fields) {

#Go through the array

my @fieldparts=split(":",$field);

#Split the fields on semicolon

my $key=shift(@fieldparts);

#Set the key

$data->{$key}=join(":",@fieldparts);

#Write the hash entry

}

if (!$headerflag) {

#If the headflag is 0 then do this

foreach my $key (sort {$a cmp $b} (keys %{$data})) {

#Go through the keys

print RESULTS $key."\t";

#Write the key followed by a tab

}

print RESULTS "\n";

#Print a newline

$headerflag=1;

#Set the headflag to 1

}

foreach my $key (sort {$a cmp $b} (keys %{$data})) {

#Go through the keys again

print RESULTS $data->{$key}."\t";

#This time print the data followed by tabs

}

print RESULTS "\n";

#print a newline

}

print "Thanks to Ed";

#Thank Ed.

[[category:McNair Admin]]

[[admin_classification::Software Tutorial| ]]

ShoebMohammed

Bureaucrats, Administrators (Semantic MediaWiki), Administrators

170

edits

Changes

PhD Masterclass - How to Build a Web Crawler (view source)

Revision as of 13:00, 18 July 2016

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

Sites

Sections

Organizations

Help

Tools