Using this package, I wrote a simple script that extracts pictures from simple HTML pages. While experimenting with it on many websites, I noticed that some websites did not allow my script to reap (extract) pictures.
Then I found that this was because of the User-Agent string in the HTTP request: some websites do not respond properly to HTTP requests that carry an unknown User-Agent string. I then used urllib's FancyURLopener to change the User-Agent to that of Firefox.
Now that the script works on most websites, I would like to post the code here. Feel free to modify it to make it more robust and reliable, and post a comment or a link to your modified script.
The code is here (or a link for you to download):
from sgmllib import SGMLParser
import sys
import os
import re
import urllib
from urllib import FancyURLopener
from urlparse import urlparse
class URLLister(SGMLParser):
    """SGML parser that records the ``href`` values of ``<a>`` tags.

    The collected values are kept in ``self.urls`` in the order in which
    the handler saw them.
    """

    def reset(self):
        """Reset the underlying parser and start with an empty URL list."""
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        ``attrs`` is a sequence of ``(name, value)`` pairs; the value of
        every pair named ``href`` is appended to ``self.urls``.
        """
        for name, value in attrs:
            if name == 'href':
                self.urls.append(value)
class MyURLOpener(FancyURLopener):
    """URL opener that identifies itself as Firefox 3.0.6 on Windows XP.

    Some sites do not respond properly to requests carrying an unknown
    User-Agent, so the opener's ``version`` string is overridden here.
    """

    # The opener's identification string, used for the User-Agent header.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6'
def url_mapper(var):
    """Map an image URL to a relative local path ``host/dirs/filename``.

    If the query string of *var* embeds another ``http://`` URL (as
    redirect or thumbnail scripts do), the embedded URL is mapped instead
    of *var* itself. When the query is non-empty but no embedded URL is
    found, a message is printed and *var* is mapped as is.

    In the host and directory part every character outside
    ``[0-9a-zA-Z_/]`` becomes ``_``, after runs of two or more dots
    followed by ``/`` have been collapsed to ``/``. The file name is kept
    as is, except that backslashes become ``_`` so it cannot act as a path
    on Windows.

    Args:
        var: The URL to map, as a string.

    Returns:
        The relative path as a string, or ``''`` when the URL has no host
        or no file name and therefore cannot be mapped.
    """
    values = urlparse(var)
    query = values[4]
    if query:
        embedded = re.search(r'.+?(http://.+&?)', query, re.I)
        if embedded:
            values = urlparse(embedded.group(1))
        else:
            print('Could not get real url from:' + query)

    host, path = values[1], values[2]
    # Without a host the result would begin with '/', i.e. an absolute
    # filesystem path chosen by the remote page; refuse to map it.
    if not host:
        return ''

    # Split at the last '/', so files directly under the site root
    # ('/pic.jpg') are mapped too instead of being rejected.
    location, slash, filename = path.rpartition('/')
    if not slash or not filename:
        return ''

    site = re.sub(r'[^0-9a-zA-Z_/]', '_', host)
    location = re.sub(r'\.{2,}/', '/', location)
    location = re.sub(r'[^0-9a-zA-Z_/]', '_', location)
    filename = filename.replace('\\', '_')
    return site + location + '/' + filename
def main(argv):
    """Download every JPEG linked from the page named by ``argv[1]``.

    The page is fetched with ``MyURLOpener``, its ``<a href>`` values are
    collected with ``URLLister``, and each link ending in ``.jpg`` or
    ``.jpeg`` (any letter case) is saved under the relative path returned
    by ``url_mapper``. Files that already exist locally are not fetched
    again. Progress is reported on standard output.

    Args:
        argv: Command-line argument list; ``argv[1]`` is the page URL
            (it is percent-decoded before use).

    Exits with status 0 if the page cannot be opened, and with a usage
    message if no URL was given.
    """
    # Local import so this block does not depend on the file's import
    # list; urljoin lives next to urlparse in the Python 2 stdlib.
    from urlparse import urljoin

    if len(argv) < 2:
        sys.exit('usage: python <script> <page-url>')

    page_url = urllib.unquote(argv[1])
    url_opener = MyURLOpener()
    try:
        usock = url_opener.open(page_url)
    except IOError:
        print('skipping ' + page_url)
        sys.exit(0)

    parser = URLLister()
    try:
        parser.feed(usock.read())
        parser.close()
    finally:
        # Close the connection even if parsing fails.
        usock.close()

    print('Url fs:  ' + str(urlparse(page_url)))

    jpeg_link = re.compile(r'\.jpe?g$', re.I)
    count = 0
    for href in parser.urls:
        if not jpeg_link.search(href):
            continue
        print('looking at: ' + href)

        # urljoin resolves relative, root-relative ('/x.jpg') and '../'
        # links against the page URL and leaves absolute URLs (http or
        # https) untouched.
        img_url = urljoin(page_url, href)

        loc = url_mapper(img_url)
        if loc == '':
            print('\turl_mapper returned NULL for ' + img_url)
            continue
        if os.path.exists(loc):
            continue

        print('\ttrying to save ' + img_url)
        try:
            loc_dir = os.path.dirname(loc)
            if loc_dir and not os.path.isdir(loc_dir):
                os.makedirs(loc_dir)
            url_opener.retrieve(img_url, loc)
        except (IOError, OSError):
            # os.makedirs raises OSError, retrieve raises IOError.
            print('\tSkipping saving ' + img_url)
            continue
        count += 1
        print('\tImg fetched:  ' + str(count))


if __name__ == '__main__':
    main(sys.argv)


