# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2008
# CondeNet, Inc. All Rights Reserved.
################################################################################
| | 22 | |
from pylons import g, config

from r2.models.link import Link
from r2.lib.workqueue import WorkQueue
from r2.lib import s3cp
from r2.lib.utils import timeago, fetch_things2
from r2.lib.db.operators import desc
from r2.lib.scraper import make_scraper

import tempfile
# NOTE(review): `config` and `Queue` appear unused in the code visible in
# this file -- confirm before removing.
from Queue import Queue
| | 34 | |
# S3 bucket path for thumbnails.  thumbnail_url/upload_thumb concatenate it
# directly with the link fullname, so it presumably carries its own leading
# and trailing slashes (e.g. '/bucket/') -- confirm against g.s3_thumb_bucket.
s3_thumbnail_bucket = g.s3_thumb_bucket
# default lookback interval handed to timeago() by process_new_links
media_period = g.media_period
# worker-thread count for the scraping work queue
threads = 20
# shared application logger
log = g.log
| | 39 | |
def thumbnail_url(link):
    """Return the public URL of *link*'s thumbnail, keyed by its fullname.

    NOTE(review): only a single slash follows 'http:' on purpose -- the
    scheme separator and path slashes are assumed to come from
    s3_thumbnail_bucket itself (e.g. '/bucket/'); confirm in the config.
    """
    return ''.join(['http:/', s3_thumbnail_bucket, link._fullname, '.png'])
| | 43 | |
def upload_thumb(link, image):
    """Upload *image* to S3 as '<bucket><fullname>.png' with a public-read ACL.

    The image is serialized into a named temporary file (the '.png' suffix
    lets the image library infer the output format from the file's name)
    and handed to s3cp by path.
    """
    f = tempfile.NamedTemporaryFile(suffix = '.png')
    try:
        image.save(f)
        # s3cp re-opens the file by name; flush so it sees the complete
        # image rather than whatever happened to be past the buffer.
        f.flush()

        resource = s3_thumbnail_bucket + link._fullname + '.png'
        log.debug('uploading to s3: %s' % link._fullname)
        s3cp.send_file(f.name, resource, 'image/png', 'public-read', None, False)
        log.debug('thumbnail %s: %s' % (link._fullname, thumbnail_url(link)))
    finally:
        # NamedTemporaryFile deletes on close; the original leaked the
        # handle (and thus the temp file) until GC.
        f.close()
| | 54 | |
def make_link_info_job(results, link, useragent):
    """Build a no-argument callable suitable for a work queue.

    When invoked, the callable scrapes link.url for a thumbnail and a
    media object, uploads the thumbnail to S3 if one was found, and
    stores the (thumbnail, media_object) pair in *results* keyed by
    *link*.  (useragent is currently unused; kept for interface
    compatibility with callers.)
    """
    def fetch_media_info():
        scraper = make_scraper(link.url)

        thumb = scraper.thumbnail()
        media = scraper.media_object()

        if thumb:
            upload_thumb(link, thumb)

        results[link] = (thumb, media)

    return fetch_media_info
| | 70 | |
def update_link(link, thumbnail, media_object):
    """Record the scraped media properties on *link* in the database.

    Only truthy results are written: has_thumbnail is flagged when a
    thumbnail was produced, and media_object is attached when one was
    found.  The link is committed unconditionally.
    """
    updates = {}
    if thumbnail:
        updates['has_thumbnail'] = True
    if media_object:
        updates['media_object'] = media_object

    for attr, value in updates.items():
        setattr(link, attr, value)

    link._commit()
| | 81 | |
def process_new_links(period = media_period, force = False):
    """Scrape media properties for links submitted in the last *period*.

    Queries recent links (newest first) and, for every non-self link that
    is missing media properties -- or for all of them when *force* is
    True -- schedules a scrape job on a work queue.  Once every job has
    finished, the resulting thumbnail/media-object pairs are written to
    the database from this thread, so db writes are never done from the
    worker threads.

    period: lookback interval passed to timeago() (default: g.media_period)
    force:  re-scrape links even if they already have media properties
    """
    links = Link._query(Link.c._date > timeago(period), sort = desc('_date'),
                        data = True)
    results = {}
    jobs = []
    for link in fetch_things2(links):
        # self posts have no external url to scrape
        if link.is_self:
            continue

        if not force and (link.has_thumbnail or link.media_object):
            continue

        jobs.append(make_link_info_job(results, link, g.useragent))

    #send links to a queue
    # use the module-level `threads` constant instead of a second
    # hard-coded 20, so the two values cannot drift apart
    wq = WorkQueue(jobs, num_workers = threads)
    wq.start()
    wq.jobs.join()

    #when the queue is finished, do the db writes in this thread
    for link, info in results.items():
        update_link(link, info[0], info[1])
| | 107 | |
def set_media(link):
    """Synchronously scrape and persist media properties for one link."""
    results = {}
    job = make_link_info_job(results, link, g.useragent)
    job()
    thumbnail, media_object = results[link]
    update_link(link, thumbnail, media_object)