# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2008
# CondeNet, Inc. All Rights Reserved.
################################################################################

from pylons import g
from r2.lib import utils
from r2.lib.memoize import memoize

from urllib2 import Request, HTTPError, URLError, urlopen
import urlparse, re, urllib, logging, StringIO
import Image, ImageFile

log = g.log
useragent = g.useragent

chunk_size = 1024
thumbnail_size = 70, 70

def image_to_str(image):
    #serialize a PIL image to a string in its original format
    s = StringIO.StringIO()
    image.save(s, image.format)
    s.seek(0)
    return s.read()

def str_to_image(s):
    #deserialize a string produced by image_to_str back into a PIL image
    s = StringIO.StringIO(s)
    s.seek(0)
    image = Image.open(s)
    return image
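
# Round-trip sketch (not from the original module; the file path is a made-up
# example): image_to_str and str_to_image are inverses, so a PIL image can be
# serialized into a plain string and later restored.
#
#   >>> img = Image.open('/tmp/example.jpg')   #hypothetical input file
#   >>> data = image_to_str(img)               #bytes in the original format
#   >>> str_to_image(data).size == img.size
#   True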

@memoize('media.fetch_url')
def fetch_url(url, referer = None, retries = 1, dimension = False):
    cur_try = 0
    #log.debug('fetching: %s' % url)
    nothing = None if dimension else (None, None)
    while True:
        try:
            req = Request(url)
            if useragent:
                req.add_header('User-Agent', useragent)
            if referer:
                req.add_header('Referer', referer)

            open_req = urlopen(req)

            #if we only need the dimension of the image, we may not
            #need the entire image
            if dimension:
                content = open_req.read(chunk_size)
            else:
                content = open_req.read()
            content_type = open_req.headers.get('content-type')

            #guard against responses with no content-type header
            if content_type and 'image' in content_type:
                p = ImageFile.Parser()
                new_data = content
                while not p.image and new_data:
                    p.feed(new_data)
                    new_data = open_req.read(chunk_size)
                    content += new_data

                #return the size, or return the data
                if dimension and p.image:
                    return p.image.size
                elif dimension:
                    return nothing
            elif dimension:
                #expected an image, but didn't get one
                return nothing

            return content_type, content

        except (URLError, HTTPError), e:
            cur_try += 1
            if cur_try >= retries:
                log.debug('error while fetching: %s referer: %s' % (url, referer))
                log.debug(e)
                return nothing
        finally:
            if 'open_req' in locals():
                open_req.close()
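
# Usage sketch (the url and the sizes are assumed, not from the original
# source): by default fetch_url returns (content_type, content); with
# dimension=True it returns just the (width, height) of an image, reading
# only as many chunks as PIL needs. On failure it returns (None, None) or
# None respectively.
#
#   >>> content_type, data = fetch_url('http://example.com/cat.jpg')
#   >>> fetch_url('http://example.com/cat.jpg', dimension = True)
#   (640, 480)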

#matches the src attribute of an <img> tag in a blob of html
img_rx = re.compile(r'<\s*(?:img)[^>]*src\s*=\s*[\"\']?([^\"\'\s>]*)[^>]*', re.IGNORECASE | re.S)
def image_urls(base_url, html):
    for match in img_rx.findall(html):
        image_url = urlparse.urljoin(base_url, match)
        yield image_url
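
# Example (hypothetical markup): relative src attributes are resolved against
# the page url.
#
#   >>> list(image_urls('http://example.com/a/', '<img src="../pic.png">'))
#   ['http://example.com/pic.png']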

class Scraper:
    def __init__(self, url):
        self.url = url
        self.content = None
        self.content_type = None

    def download(self):
        self.content_type, self.content = fetch_url(self.url)

    def largest_image_url(self):
        if not self.content:
            self.download()

        #if download didn't work
        if not self.content:
            return None

        max_area = 0
        max_url = None

        #if the original url was an image, use that
        if self.content_type and 'image' in self.content_type:
            urls = [self.url]
        else:
            urls = image_urls(self.url, self.content)

        for image_url in urls:
            size = fetch_url(image_url, referer = self.url, dimension = True)
            if not size:
                continue

            area = size[0] * size[1]

            #ignore little images
            if area < 5000:
                log.debug('ignore little %s' % image_url)
                continue

            #ignore excessively long/wide images (float division so the
            #aspect-ratio check isn't floored to an int under Python 2)
            if float(max(size)) / min(size) > 1.5:
                log.debug('ignore dimensions %s' % image_url)
                continue

            if area > max_area:
                max_area = area
                max_url = image_url

        return max_url

    def thumbnail(self):
        image_url = self.largest_image_url()
        if image_url:
            content_type, image_str = fetch_url(image_url, referer = self.url)
            if image_str:
                image = str_to_image(image_str)
                image.thumbnail(thumbnail_size, Image.ANTIALIAS)
                return image

    def media_object(self):
        return None
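
# Intended flow, as a sketch (urls and result are invented for illustration):
# a Scraper downloads the page, picks the largest image that is at least
# 5000 px^2 and roughly square-ish, and can shrink it to thumbnail_size.
#
#   >>> s = Scraper('http://example.com/article')
#   >>> s.largest_image_url()   #e.g. 'http://example.com/hero.jpg'
#   >>> thumb = s.thumbnail()   #PIL image bounded by 70x70, or None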

youtube_rx = re.compile(r'.*v=([A-Za-z0-9-_]+).*')

class YoutubeScraper(Scraper):
    media_template = '<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/%s"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s" type="application/x-shockwave-flash" wmode="transparent" width="425" height="350"></embed></object>'

    def __init__(self, url):
        m = youtube_rx.match(url)
        if m:
            self.video_id = m.groups()[0]
        else:
            #if it's not a youtube video, just treat it like a normal page
            log.debug('reverting youtube to regular scraper: %s' % url)
            self.__class__ = Scraper

        Scraper.__init__(self, url)

    def largest_image_url(self):
        return 'http://img.youtube.com/vi/%s/default.jpg' % self.video_id

    def media_object(self):
        return self.media_template % (self.video_id, self.video_id)

gootube_rx = re.compile(r'.*videoplay\?docid=([A-Za-z0-9-_]+).*')
gootube_thumb_rx = re.compile(r".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)

class GootubeScraper(Scraper):
    media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=%s&hl=en" flashvars=""> </embed>'

    def __init__(self, url):
        m = gootube_rx.match(url)
        if m:
            self.video_id = m.groups()[0]
        else:
            #if it's not a google video url, treat it like a normal page
            self.__class__ = Scraper
        Scraper.__init__(self, url)

    def largest_image_url(self):
        if not self.content:
            self.download()

        if not self.content:
            return None

        m = gootube_thumb_rx.match(self.content)
        if m:
            image_url = m.groups()[0]
            image_url = utils.safe_eval_str(image_url)
            return image_url

    def media_object(self):
        return self.media_template % self.video_id

scrapers = {'youtube.com': YoutubeScraper,
            'video.google.com': GootubeScraper}

youtube_in_google_rx = re.compile('.*<div class="original-text">.*href="(http://[^"]*youtube.com/watch[^"]+).*', re.S)

def make_scraper(url):
    scraper = scrapers.get(utils.domain(url), Scraper)

    #sometimes youtube videos masquerade as google video pages
    if scraper == GootubeScraper:
        h = Scraper(url)
        h.download()
        #the download may have failed, leaving content as None
        if h.content:
            m = youtube_in_google_rx.match(h.content)
            if m:
                youtube_url = m.groups()[0]
                log.debug('%s is really %s' % (url, youtube_url))
                url = youtube_url
                return make_scraper(url)
    return scraper(url)
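
# Dispatch example (urls are hypothetical): make_scraper keys off the link's
# domain and falls back to the generic Scraper for anything unrecognized.
#
#   >>> make_scraper('http://youtube.com/watch?v=abc123').__class__.__name__
#   'YoutubeScraper'
#   >>> make_scraper('http://example.com/post').__class__.__name__
#   'Scraper'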

def test():
    from r2.lib.pool2 import WorkQueue
    jobs = []
    f = open('/tmp/testurls.txt')
    for url in f:
        if url.startswith('#'):
            continue
        if url.startswith('/info'):
            continue

        def make_job(url):
            def fetch(url):
                print 'START', url
                url = url.strip()
                h = make_scraper(url)
                image_url = h.largest_image_url()
                print 'DONE', image_url
            return lambda: fetch(url)

        jobs.append(make_job(url))

    print jobs[0]()
    #wq = WorkQueue(jobs)
    #wq.start()

if __name__ == '__main__':
    test()