# urllib と BeautifulSoup でのスクレイピングの練習を兼ねて作ってみた。何に使うかは想像にお任せする ;p
# (Written as practice with urllib / BeautifulSoup scraping; what you use it for is left to your imagination.)
# coding: utf-8
import urllib
import re
import argparse
import os
from BeautifulSoup import BeautifulSoup
# Version string reported by -v/--version.
script_version = 'v0.0.1'

# Matches URLs whose path ends in a downloadable image extension.
# Raw string so "\." is a literal regex escape, not a Python one.
# NOTE: re.match() anchors only at the start, so trailing text
# (e.g. "?query") after the extension still matches.
re_image = re.compile(r".+\.(jpg|png|gif)")
def get_linked_images(soup):
for a in soup("a"):
for i in a("img"):
a2 = i.parent
if re_image.match(a2["href"]):
image = a2["href"]
file = url_to_filename(image)
try:
print image
urllib.urlretrieve(image, file)
except IOError:
pass
def get_embeded_images(soup):
for i in soup("img"):
image = i["src"]
if re_image.match(image):
file = url_to_filename(image)
try:
print image
urllib.urlretrieve(image, file)
except IOError:
pass
def url_to_filename(url):
    """Derive a local file name from an image URL.

    Takes the last path component and strips any "?query" suffix.
    If --dir was given on the command line, the name is joined under
    that directory.

    NOTE(review): depends on the module-level ``args`` namespace
    being populated by argparse before this is called.
    """
    filename = url.split('/')[-1]
    # Raw string so "\?" is a regex escape; drops the query string.
    filename = re.sub(r'\?.+', '', filename)
    if args.dir:
        filename = os.path.join(args.dir, filename)
    return filename
parser = argparse.ArgumentParser(description="Download images from web page.")
parser.add_argument('url', metavar='URL', nargs='?', action='store',
help='specify URL.')
parser.add_argument('-v', '--version', dest='version', action='store_true',
help='show version and exit')
parser.add_argument('-e', '--embeded-image', dest='embeded', action='store_true',
help='download embeded images(default)')
parser.add_argument('-l', '--linked-image', dest='linked', action='store_true',
help='download linked images')
parser.add_argument('-d', '--dir', dest='dir', metavar='DIR', action='store',
help='download into DIR')
args = parser.parse_args()
if args.version:
print script_version
exit()
url = args.url
if args.dir:
os.makedirs(args.dir)
print "Download images from " + url + "\n"
res = urllib.urlopen(url).read()
soup = BeautifulSoup(res)
if args.linked:
get_linked_images(soup)
else:
get_enbeded_images(soup)