Use Perl Mechanize and LWP to Scrape Site Data
LWP is a great module for scraping data. This script example was created for a customer that wanted to profile various cities for marketing purposes.
It should be said that complex regex is not generally the best way to parse pages. If the page changes, the regex will no longer parse like expected. A better tool might be anything else besides regex!
With that said, this is also a good demo of how to use LWP.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
#!/usr/bin/perl # Description: # initial research for home purchases/wholesaling use warnings; use strict; # hot pipes $|++; # time it my $start = time; use LWP::UserAgent; # force the bad ssl override # this allows it to work, but gives warning below BEGIN { $ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0 } # bug, need lwp protocol https 6.0.4 to stifle warning #******************************************************************* #Using the default of SSL_verify_mode of SSL_VERIFY_NONE for client #is deprecated! Please set SSL_verify_mode to SSL_VERIFY_PEER #together with SSL_ca_file|SSL_ca_path for verification. #If you really don't want to verify the certificate and keep the #connection open to Man-In-The-Middle attacks please set #SSL_verify_mode explicitly to SSL_VERIFY_NONE in your application. #******************************************************************* # initial user agent to defeat possibly script detection # if still blocked, add random timer and chill my $magent = 'Mozilla/5.0 (Linux; U; Android 2.2; de-de; HTC Desire HD 1.18.161.2 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'; # ssl_opts aren't working as per bug my $agent = LWP::UserAgent->new( ssl_opts => { verify_hostname => 0, SSL_verify_mode => 0, agent => $magent } ); # mechanize lets us emulate a browser use WWW::Mechanize; # requires manual population initially to get zips my @zips = qw( 75098 75160 75135 74012 74136 74015 ); # total number of zips my $number = @zips; # strip data and put in hash my %d; # print header and reset open FILE, ">", 'zips.csv' or die $!; print FILE "ZIPCODE,POPULATION,CITY,INCOME,MEDIAN,DETACHED,RENT,RENT\%,WHITE,BLACK,ASIAN,HISPANIC,URL\n"; close FILE; foreach my $zip (@zips) { # check a url my $url = 'http://www.city-data.com/zips/' . $zip . '.html'; my $response = $agent->get($url); if( $response->is_success ){ my $mech = WWW::Mechanize->new(); $mech->get($url) or die "FAILED $!\n"; if ($mech->success) { warn $url . "\n"; # is there a way to process line by line instead? my @test = split(/\n/,$mech->content); my $lines = @test; foreach my $line (@test) { $line =~s/\,//g; # to keep or not to keep, that is question if ($line =~ m/title\>$zip Zip Code \(([a-zA-Z ]*) (Texas|TX|OK|Oklahoma)\)/){ $d{'city'} = $1; #warn "CITY:\t" . $d{'city'} . "\n"; } if ($line =~ m/population in 2016\:\<\/b\> (\d+)/) { $d{'pop'} = $1; #warn "POP:\t" . $d{'pop'} . "\n"; } if ($line =~ m/\<b\>Estimated median house\/condo value in 2016\:\<\/b\> \$(\d+)/) { $d{'median'} = $1; #warn "MEDIAN:\t" . $d{'median'} . "\n"; } if ($line =~ m/\<b\>Detached houses\:\<\/b\> \$(\d+)\<div/) { $d{'detached'} = $1; #warn "DETACHED:\t" . $d{'detached'} . "\n"; } if ($line =~ m/Estimated median household income in 2016: \<\/b\>\<\/b\>\<table\>\<tr\>\<td\>\<b\>This zip code\:\<\/b\>\<\/td\>\<td\>\<p class\=\'h\' style\=\'padding-left\:\d+px\;\'\>\<\/p\>\$(\d+)\<\/td\>\<\/tr\>/) { $d{'income'} = $1; #warn "INCOME: " . $d{'income'} . "\n"; } if ($line =~ m/Median gross rent in 2016\:\<\/b\> \$(\d+)/) { $d{'rent'} = $1; #warn "RENT: " . $d{'rent'} . "\n"; } if ($line =~ m/\% of renters here\:\<\/b\>\<\/td\>\<td\>\<p class='h' style='padding-left:\d+px\;'\>\<\/p\>(\d+)\%/) { $d{'renters'} = $1; #warn "RENTERS: " . $d{'renters'} . "\n"; } if ($line =~ m/\<span class='badge'\>(\d+)\<\/span\>White population/) { $d{'white'} = $1; #warn "WHITE: " . $d{'white'} . "\n"; } if ($line =~ m/\<span class='badge'\>(\d+)\<\/span\>Black population/) { $d{'black'} = $1; #warn "BLACK: " . $d{'black'} . "\n"; } if ($line =~ m/\<span class='badge'\>(\d+)\<\/span\>Asian population/) { $d{'asian'} = $1; #warn "ASIAN: " . $d{'asian'} . "\n"; } if ($line =~ m/\<span class='badge'\>(\d+)\<\/span\>Hispanic or Latino population/) { $d{'hispanic'} = $1; #warn "HISPANIC: " . $d{'hispanic'} . "\n"; } } # write each config to a separate file open FILE, ">>", 'zips.csv' or die $!; print FILE "$zip,$d{'pop'},$d{'city'},$d{'income'},$d{'median'},$d{'detached'},$d{'rent'},$d{'renters'},$d{'white'},$d{'black'},$d{'asian'},$d{'hispanic'},$url,0,0\n"; close FILE; } else {die "mech failed, '$!'\n";} } else {die "connect failed,'$!'\n";} } my $duration = time - $start; print "Processed $number zips in $duration seconds\n"; exit; |