# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2008
# CondeNet, Inc. All Rights Reserved.
################################################################################

from pylons import g
from r2.lib import utils
from r2.lib.memoize import memoize

from urllib2 import Request, HTTPError, URLError, urlopen
import urlparse, re, urllib, logging, StringIO
import Image, ImageFile

log = g.log
useragent = g.useragent

chunk_size = 1024
thumbnail_size = 70, 70

def image_to_str(image):
    #serialize a PIL image to a string in its original format
    s = StringIO.StringIO()
    image.save(s, image.format)
    s.seek(0)
    return s.read()

def str_to_image(s):
    #deserialize a string produced by image_to_str back into a PIL image
    s = StringIO.StringIO(s)
    s.seek(0)
    image = Image.open(s)
    return image
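
# Round-trip sketch (not from the original module; the file path is a made-up
# example): image_to_str and str_to_image are inverses, so a PIL image can be
# serialized into a plain string and later restored.
#
#   >>> img = Image.open('/tmp/example.jpg')   #hypothetical input file
#   >>> data = image_to_str(img)               #bytes in the original format
#   >>> str_to_image(data).size == img.size
#   True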

@memoize('media.fetch_url')
def fetch_url(url, referer = None, retries = 1, dimension = False):
    cur_try = 0
    #log.debug('fetching: %s' % url)
    nothing = None if dimension else (None, None)
    while True:
        try:
            req = Request(url)
            if useragent:
                req.add_header('User-Agent', useragent)
            if referer:
                req.add_header('Referer', referer)

            open_req = urlopen(req)

            #if we only need the dimension of the image, we may not
            #need the entire image
            if dimension:
                content = open_req.read(chunk_size)
            else:
                content = open_req.read()
            content_type = open_req.headers.get('content-type')

            #guard against responses with no content-type header
            if content_type and 'image' in content_type:
                p = ImageFile.Parser()
                new_data = content
                while not p.image and new_data:
                    p.feed(new_data)
                    new_data = open_req.read(chunk_size)
                    content += new_data

                #return the size, or return the data
                if dimension and p.image:
                    return p.image.size
                elif dimension:
                    return nothing
            elif dimension:
                #expected an image, but didn't get one
                return nothing

            return content_type, content

        except (URLError, HTTPError), e:
            cur_try += 1
            if cur_try >= retries:
                log.debug('error while fetching: %s referer: %s' % (url, referer))
                log.debug(e)
                return nothing
        finally:
            if 'open_req' in locals():
                open_req.close()
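
# Usage sketch (the url and the sizes are assumed, not from the original
# source): by default fetch_url returns (content_type, content); with
# dimension=True it returns just the (width, height) of an image, reading
# only as many chunks as PIL needs. On failure it returns (None, None) or
# None respectively.
#
#   >>> content_type, data = fetch_url('http://example.com/cat.jpg')
#   >>> fetch_url('http://example.com/cat.jpg', dimension = True)
#   (640, 480)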

#matches the src attribute of an <img> tag in a blob of html
img_rx = re.compile(r'<\s*(?:img)[^>]*src\s*=\s*[\"\']?([^\"\'\s>]*)[^>]*', re.IGNORECASE | re.S)
def image_urls(base_url, html):
    for match in img_rx.findall(html):
        image_url = urlparse.urljoin(base_url, match)
        yield image_url
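
# Example (hypothetical markup): relative src attributes are resolved against
# the page url.
#
#   >>> list(image_urls('http://example.com/a/', '<img src="../pic.png">'))
#   ['http://example.com/pic.png']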

class Scraper:
    def __init__(self, url):
        self.url = url
        self.content = None
        self.content_type = None

    def download(self):
        self.content_type, self.content = fetch_url(self.url)

    def largest_image_url(self):
        if not self.content:
            self.download()

        #if download didn't work
        if not self.content:
            return None

        max_area = 0
        max_url = None

        #if the original url was an image, use that
        if self.content_type and 'image' in self.content_type:
            urls = [self.url]
        else:
            urls = image_urls(self.url, self.content)

        for image_url in urls:
            size = fetch_url(image_url, referer = self.url, dimension = True)
            if not size:
                continue

            area = size[0] * size[1]

            #ignore little images
            if area < 5000:
                log.debug('ignore little %s' % image_url)
                continue

            #ignore excessively long/wide images (float division so the
            #aspect-ratio check isn't floored to an int under Python 2)
            if float(max(size)) / min(size) > 1.5:
                log.debug('ignore dimensions %s' % image_url)
                continue

            if area > max_area:
                max_area = area
                max_url = image_url

        return max_url

    def thumbnail(self):
        image_url = self.largest_image_url()
        if image_url:
            content_type, image_str = fetch_url(image_url, referer = self.url)
            if image_str:
                image = str_to_image(image_str)
                image.thumbnail(thumbnail_size, Image.ANTIALIAS)
                return image

    def media_object(self):
        return None
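
# Intended flow, as a sketch (urls and result are invented for illustration):
# a Scraper downloads the page, picks the largest image that is at least
# 5000 px^2 and roughly square-ish, and can shrink it to thumbnail_size.
#
#   >>> s = Scraper('http://example.com/article')
#   >>> s.largest_image_url()   #e.g. 'http://example.com/hero.jpg'
#   >>> thumb = s.thumbnail()   #PIL image bounded by 70x70, or None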

youtube_rx = re.compile(r'.*v=([A-Za-z0-9-_]+).*')

class YoutubeScraper(Scraper):
    media_template = '<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/%s"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s" type="application/x-shockwave-flash" wmode="transparent" width="425" height="350"></embed></object>'

    def __init__(self, url):
        m = youtube_rx.match(url)
        if m:
            self.video_id = m.groups()[0]
        else:
            #if it's not a youtube video, just treat it like a normal page
            log.debug('reverting youtube to regular scraper: %s' % url)
            self.__class__ = Scraper

        Scraper.__init__(self, url)

    def largest_image_url(self):
        return 'http://img.youtube.com/vi/%s/default.jpg' % self.video_id

    def media_object(self):
        return self.media_template % (self.video_id, self.video_id)

gootube_rx = re.compile(r'.*videoplay\?docid=([A-Za-z0-9-_]+).*')
gootube_thumb_rx = re.compile(r".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)

class GootubeScraper(Scraper):
    media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=%s&hl=en" flashvars=""> </embed>'

    def __init__(self, url):
        m = gootube_rx.match(url)
        if m:
            self.video_id = m.groups()[0]
        else:
            #if it's not a google video url, treat it like a normal page
            self.__class__ = Scraper
        Scraper.__init__(self, url)

    def largest_image_url(self):
        if not self.content:
            self.download()

        if not self.content:
            return None

        m = gootube_thumb_rx.match(self.content)
        if m:
            image_url = m.groups()[0]
            image_url = utils.safe_eval_str(image_url)
            return image_url

    def media_object(self):
        return self.media_template % self.video_id

scrapers = {'youtube.com': YoutubeScraper,
            'video.google.com': GootubeScraper}

youtube_in_google_rx = re.compile('.*<div class="original-text">.*href="(http://[^"]*youtube.com/watch[^"]+).*', re.S)

def make_scraper(url):
    scraper = scrapers.get(utils.domain(url), Scraper)

    #sometimes youtube videos masquerade as google video pages
    if scraper == GootubeScraper:
        h = Scraper(url)
        h.download()
        #the download may have failed, leaving content as None
        if h.content:
            m = youtube_in_google_rx.match(h.content)
            if m:
                youtube_url = m.groups()[0]
                log.debug('%s is really %s' % (url, youtube_url))
                url = youtube_url
                return make_scraper(url)
    return scraper(url)
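
# Dispatch example (urls are hypothetical): make_scraper keys off the link's
# domain and falls back to the generic Scraper for anything unrecognized.
#
#   >>> make_scraper('http://youtube.com/watch?v=abc123').__class__.__name__
#   'YoutubeScraper'
#   >>> make_scraper('http://example.com/post').__class__.__name__
#   'Scraper'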

def test():
    from r2.lib.pool2 import WorkQueue
    jobs = []
    f = open('/tmp/testurls.txt')
    for url in f:
        if url.startswith('#'):
            continue
        if url.startswith('/info'):
            continue

        def make_job(url):
            def fetch(url):
                print 'START', url
                url = url.strip()
                h = make_scraper(url)
                image_url = h.largest_image_url()
                print 'DONE', image_url
            return lambda: fetch(url)

        jobs.append(make_job(url))

    print jobs[0]()
    #wq = WorkQueue(jobs)
    #wq.start()

if __name__ == '__main__':
    test()