# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer.  The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2008
# CondeNet, Inc. All Rights Reserved.
################################################################################

from pylons import g
from r2.lib import utils
from r2.lib.memoize import memoize

from urllib2 import Request, HTTPError, URLError, urlopen
import urlparse, re, urllib, logging, StringIO
import Image, ImageFile

log = g.log
useragent = g.useragent

chunk_size = 1024
thumbnail_size = 70, 70

def image_to_str(image):
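    """Serialize a PIL image to a string, preserving its original format."""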
    s = StringIO.StringIO()
    image.save(s, image.format)
    s.seek(0)
    return s.read()

def str_to_image(s):
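    """Build a PIL image from a raw string of image data."""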
    s = StringIO.StringIO(s)
    s.seek(0)
    image = Image.open(s)
    return image

@memoize('media.fetch_url')
def fetch_url(url, referer = None, retries = 1, dimension = False):
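    """Fetch url, sending the configured user-agent and optional referer.

    Returns (content_type, content) on success.  With dimension=True,
    only enough of the response is read to parse the image's size,
    which is returned as a (width, height) tuple, or None if the
    response isn't a parseable image.  Failed fetches are attempted
    up to `retries` times in total.
    """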
    cur_try = 0
    #log.debug('fetching: %s' % url)
    nothing = None if dimension else (None, None)
    while True:
        try:
            req = Request(url)
            if useragent:
                req.add_header('User-Agent', useragent)
            if referer:
                req.add_header('Referer', referer)

            open_req = urlopen(req)

            #if we only need the dimension of the image, we may not
            #need the entire image
            if dimension:
                content = open_req.read(chunk_size)
            else:
                content = open_req.read()
            content_type = open_req.headers.get('content-type')

            #guard against responses with no content-type header
            if content_type and 'image' in content_type:
                p = ImageFile.Parser()
                new_data = content
                while not p.image and new_data:
                    p.feed(new_data)
                    new_data = open_req.read(chunk_size)
                    content += new_data

                #return the size, or return the data
                if dimension and p.image:
                    return p.image.size
                elif dimension:
                    return nothing
            elif dimension:
                #expected an image, but didn't get one
                return nothing

            return content_type, content

        except (URLError, HTTPError), e:
            cur_try += 1
            if cur_try >= retries:
                log.debug('error while fetching: %s referer: %s' % (url, referer))
                log.debug(e)
                return nothing
        finally:
            if 'open_req' in locals():
                open_req.close()

img_rx = re.compile(r'<\s*(?:img)[^>]*src\s*=\s*[\"\']?([^\"\'\s>]*)[^>]*',
                    re.IGNORECASE | re.S)
def image_urls(base_url, html):
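    """Yield the URL of every <img> src in html, resolved against base_url."""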
    for match in img_rx.findall(html):
        image_url = urlparse.urljoin(base_url, match)
        yield image_url

class Scraper:
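    """Generic page scraper.

    Downloads a URL and heuristically picks the largest usable image
    on the page; subclasses override largest_image_url() and
    media_object() for site-specific behavior.
    """
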
    def __init__(self, url):
        self.url = url
        self.content = None
        self.content_type = None

    def download(self):
        self.content_type, self.content = fetch_url(self.url)

    def largest_image_url(self):
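        """Return the URL of the largest usable image on the page.

        Candidates under 5000 square pixels or with an aspect ratio
        over 1.5 are skipped.
        """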
        if not self.content:
            self.download()

        #if download didn't work
        if not self.content:
            return None

        max_area = 0
        max_url = None

        #if the original url was an image, use that
        if self.content_type and 'image' in self.content_type:
            urls = [self.url]
        else:
            urls = image_urls(self.url, self.content)

        for image_url in urls:
            size = fetch_url(image_url, referer = self.url, dimension = True)
            if not size:
                continue

            area = size[0] * size[1]

            #ignore little images
            if area < 5000:
                log.debug('ignore little %s' % image_url)
                continue

            #ignore excessively long/wide images (float division, so
            #ratios like 3:2 aren't truncated to 1)
            if float(max(size)) / min(size) > 1.5:
                log.debug('ignore dimensions %s' % image_url)
                continue

            if area > max_area:
                max_area = area
                max_url = image_url

        return max_url

    def thumbnail(self):
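        """Return the largest image on the page, shrunk to thumbnail_size."""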
        image_url = self.largest_image_url()
        if image_url:
            content_type, image_str = fetch_url(image_url, referer = self.url)
            if image_str:
                image = str_to_image(image_str)
                image.thumbnail(thumbnail_size, Image.ANTIALIAS)
                return image

    def media_object(self):
        return None

youtube_rx = re.compile('.*v=([A-Za-z0-9-_]+).*')

class YoutubeScraper(Scraper):
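    """Scraper for youtube.com video pages.

    The video id is pulled from the URL's "v=" parameter; thumbnails
    come from img.youtube.com, and the media object is the standard
    flash embed.  Non-video youtube URLs fall back to plain Scraper.
    """
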
    media_template = '<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/%s"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/%s" type="application/x-shockwave-flash" wmode="transparent" width="425" height="350"></embed></object>'

    def __init__(self, url):
        m = youtube_rx.match(url)
        if m:
            self.video_id = m.groups()[0]
        else:
            #if it's not a youtube video, just treat it like a normal page
            log.debug('reverting youtube to regular scraper: %s' % url)
            self.__class__ = Scraper

        Scraper.__init__(self, url)

    def largest_image_url(self):
        return 'http://img.youtube.com/vi/%s/default.jpg' % self.video_id

    def media_object(self):
        return self.media_template % (self.video_id, self.video_id)

gootube_rx = re.compile(r'.*videoplay\?docid=([A-Za-z0-9-_]+).*')
gootube_thumb_rx = re.compile(r".*thumbnail:\s*'(http://[^/]+/ThumbnailServer2[^']+)'.*",
                              re.IGNORECASE | re.S)

class GootubeScraper(Scraper):
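    """Scraper for video.google.com video pages.

    The video id comes from the "videoplay?docid=" URL, and the
    thumbnail URL is scraped out of the page's javascript.
    """
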
    media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=%s&hl=en" flashvars=""> </embed>'

    def __init__(self, url):
        m = gootube_rx.match(url)
        if m:
            self.video_id = m.groups()[0]
        else:
            self.__class__ = Scraper
        Scraper.__init__(self, url)

    def largest_image_url(self):
        if not self.content:
            self.download()

        if not self.content:
            return None

        m = gootube_thumb_rx.match(self.content)
        if m:
            image_url = m.groups()[0]
            image_url = utils.safe_eval_str(image_url)
            return image_url

    def media_object(self):
        return self.media_template % self.video_id

scrapers = {'youtube.com': YoutubeScraper,
            'video.google.com': GootubeScraper}

youtube_in_google_rx = re.compile('.*<div class="original-text">.*href="(http://[^"]*youtube.com/watch[^"]+).*', re.S)

def make_scraper(url):
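    """Pick a scraper class based on the url's domain.

    Google Video pages that merely wrap a youtube video are
    re-dispatched to YoutubeScraper.
    """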
    scraper = scrapers.get(utils.domain(url), Scraper)

    #sometimes youtube scrapers masquerade as google scrapers
    if scraper == GootubeScraper:
        h = Scraper(url)
        h.download()
        #the page may have failed to download
        if h.content:
            m = youtube_in_google_rx.match(h.content)
            if m:
                youtube_url = m.groups()[0]
                log.debug('%s is really %s' % (url, youtube_url))
                url = youtube_url
                return make_scraper(url)
    return scraper(url)

def test():
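    """Crude smoke test: scrape every URL listed in /tmp/testurls.txt."""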
    from r2.lib.pool2 import WorkQueue
    jobs = []
    f = open('/tmp/testurls.txt')
    for url in f:
        if url.startswith('#'):
            continue
        if url.startswith('/info'):
            continue

        def make_job(url):
            def fetch(url):
                print 'START', url
                url = url.strip()
                h = make_scraper(url)
                image_url = h.largest_image_url()
                print 'DONE', image_url
            return lambda: fetch(url)

        jobs.append(make_job(url))

    print jobs[0]()
    #wq = WorkQueue(jobs)
    #wq.start()

if __name__ == '__main__':
    test()