Changeset 8a084e970bdeeb8014b2211ea1eee11a34faa9ff

User picture

Committer: Jérémie Roquet

Author: Jérémie Roquet

Parent: 569d131db8

(2010/01/24 12:20) Over 2 years ago

* Arkbot
 - Additional tasks
* Proto
 - First prototype that works

Affected files

Updated arkbot/task.py Download diff

569d131db81c90b9771c0148981c10cf7ec83b048a084e970bdeeb8014b2211ea1eee11a34faa9ff
8
	def run(self):
8
	def run(self):
9
		raise NotImplementedError()
9
		raise NotImplementedError()
10
10
11
class Loop(Task):
	"""Task that invokes a wrapped task over and over, without end."""

	def __init__(self, task):
		"""Remember the object to invoke on every iteration.

		task -- object that is *called* (``task()``) on each iteration
		"""
		super(Loop, self).__init__()
		self.__task = task

	def run(self):
		"""Call the wrapped task forever; returns only if the task raises."""
		# NOTE(review): the wrapped task is called directly, whereas
		# Sequential invokes ``task.run()`` -- confirm that tasks are callable.
		while True:
			self.__task()
20
21
class Sequential(Task):
	"""Task that runs several tasks one after the other, in the given order."""

	def __init__(self, *tasks):
		"""Store a private copy of the tasks passed as positional arguments."""
		super(Sequential, self).__init__()
		self.__tasks = list(tasks)

	def run(self):
		"""Run every stored task in turn through its ``run()`` method.

		An exception raised by one task propagates and the remaining tasks
		are not run.
		"""
		for step in self.__tasks:
			step.run()
30
11
class Parallel(Task):
31
class Parallel(Task):
12
32
13
	def __init__(self, *tasks):
33
	def __init__(self, *tasks):

Added LICENSE Download diff

569d131db81c90b9771c0148981c10cf7ec83b048a084e970bdeeb8014b2211ea1eee11a34faa9ff
1
(C) 2009-2010 Arkanosis <arkanosis@gmail.com>
2
3
Permission is hereby granted, free of charge, to any person obtaining a copy
4
of this software and associated documentation files (the "Software"), to deal
5
in the Software without restriction, including without limitation the rights
6
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
copies of the Software, and to permit persons to whom the Software is
8
furnished to do so, subject to the following conditions:
9
10
The above copyright notice and this permission notice shall be included in
11
all copies or substantial portions of the Software.
12
13
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
THE SOFTWARE.

Added mediawiki.py Download diff

569d131db81c90b9771c0148981c10cf7ec83b048a084e970bdeeb8014b2211ea1eee11a34faa9ff
1
# -*- coding: utf-8 -*-
2
3
# Documentation about the API is available at:
4
# http://www.mediawiki.org/wiki/API
5
6
class Session(object):
	"""Placeholder for an authenticated MediaWiki API session.

	Every field starts out as None; ``start`` and ``stop`` are not
	implemented yet and currently do nothing.
	"""

	def __init__(self):
		"""Create a session with no identifier, user, token or prefix set."""
		self.__id = None
		self.__userid = None
		self.__username = None
		self.__token = None
		self.__prefix = None

	def start(self, username, password):
		"""Open the session for the given credentials (not implemented)."""
		# TODO POST request
		return None

	def stop(self):
		"""Close the session (not implemented)."""
		# TODO request
		return None
22
23
class Request(object):
	"""Base class for MediaWiki API requests (skeleton, nothing is sent yet)."""

	def __init__(self):
		"""Create an empty request."""
		pass

	def _send(self, session):
		"""Send the request through the given session (not implemented).

		The helper uses a single leading underscore on purpose: a
		double-underscore name would be mangled per class, which makes it
		unreachable from subclasses such as Query.
		"""
		# TODO request
		pass

	def __call__(self, session):
		"""Execute the request within the given session; the base class does nothing."""
		pass


class Query(Request):
	"""Request about a single wiki page."""

	def __init__(self, page):
		"""Remember the page the query is about."""
		super(Query, self).__init__()
		self.__page = page

	def __call__(self, session):
		"""Build the query (not implemented yet) and send it through the session."""
		# TODO build request
		self._send(session)

Added proto/arkbot.py Download diff

569d131db81c90b9771c0148981c10cf7ec83b048a084e970bdeeb8014b2211ea1eee11a34faa9ff
1
#! /bin/env python2.7
2
# -*- coding: utf-8 -*-
3
4
# Arkbot (prototype)
5
# (C) 2009-2010 Arkanosis
6
# arkanosis@gmail.com
7
8
# Ce bot est un *prototype* pour Arkbot et n'est pas destiné à un usage en production
9
# http://github.com/Arkanosis/Wikipedia/arkbot (prototype)
10
# http://trac-git.assembla.com/arkbot/ (version architecturée)
11
12
# Ce bot est mis à disposition sous licence MIT
13
# http://www.opensource.org/licenses/mit-license.php
14
15
16
# [ Tâches ]
17
# - Conversion des règles de Salebot vers Abusefilter
18
# - Dépouillage automatique des votes de Condorcet / Schulze avec publication des résultats sur une page donnée
19
# - Recherche de fautes d'orthographe / grammaire courantes, d'expressions non neutres...
20
# - Génération automatique et remplissage des infoboxes / catégories à partir du texte et des interwikis
21
# - Recherche des articles déblanchis et des bandeaux admissibilité / suppression retirés par IP / newbie
22
# - Revert de toutes les contributions d'un vandale / spammeur
23
# - Recherche de copyvio (cf. copyright.py dans pywikipedia et équivalents sur en)
24
# - Recherche de spam (liens externes sous ip ou nouvel utilisateur)
25
# - Lutte contre les vandales (cf. Salebot et équivalents sur en., IA...)
26
27
# [ Notes ]
28
# - action=render pour n'avoir que la page (pas les menus, header, footer...)
29
30
# [ Utile ]
31
# - http://www.mediawiki.org/wiki/Manual:Parameters_to_index.php
32
# - http://www.mediawiki.org/wiki/API
33
34
import getpass
35
import gzip
36
import json
37
import logging
38
import logging.handlers
39
import os
40
import re
41
import string
42
import StringIO
43
import sys
44
import subprocess
45
import tempfile
46
import urllib
47
import urllib2
48
import xml.dom.minidom
49
50
_userName = 'Arkbot'
51
52
_version = '0.1 pre-alpha'
53
_userAgent = 'Arkbot/' + _version
54
55
_diff = 'gvimdiff'
56
57
_lang = 'fr'
58
_wiki = '%s.wikipedia.org'
59
_apiUrl = '/w/api.php?assert=user&'
60
_rawUrl = '/w/index.php?'
61
_searchUrl = _rawUrl + 'title=Spécial:Recherche&search='
62
63
_getHeaders = {
64
	'Accept-encoding': 'gzip',
65
	'User-Agent': _userAgent,
66
}
67
68
_postHeaders = {
69
	'Content-type': 'application/x-www-form-urlencoded',
70
	'Accept': 'text/plain',
71
	'Accept-encoding': 'gzip',
72
	'User-Agent': _userAgent,
73
}
74
75
_internalLink = re.compile(r'\[\[[^\]]+\]\]')
76
77
_categoryNamespace = {
78
	'en': 'category',
79
	'fr': 'catégorie',
80
	# TODO dump the different values
81
}
82
83
class ArkbotException(Exception):
	"""Root of every error raised by the bot itself."""

	def __init__(self, reason):
		"""Build the error from a mandatory, human-readable reason."""
		super(ArkbotException, self).__init__(reason)


class HttpException(ArkbotException):
	"""An HTTP request failed."""

	def __init__(self, reason):
		"""Build the error from a mandatory, human-readable reason."""
		super(HttpException, self).__init__(reason)


class ApiException(ArkbotException):
	"""The MediaWiki API reported an error or an unexpected result."""

	def __init__(self, reason):
		"""Build the error from a mandatory, human-readable reason."""
		super(ApiException, self).__init__(reason)


class PageNotFoundException(ArkbotException):
	"""The API returned an empty response for the query."""

	def __init__(self, reason):
		"""Build the error from a mandatory, human-readable reason."""
		super(PageNotFoundException, self).__init__(reason)


class BadPasswordException(ArkbotException):
	"""Error type reserved for a rejected password."""
101
102
class ApiResponse(object):
	"""Attribute-style view over a decoded JSON dictionary.

	Keys become attributes, nested dictionaries are wrapped recursively and
	reading an attribute that does not exist yields None instead of raising.
	Items whose key parses as an integer are additionally collected and
	returned by ``values()``.
	"""

	def __init__(self, dictionary):
		"""Wrap ``dictionary``.

		The dictionary is not copied: it becomes the instance ``__dict__``
		and is modified in place (nested dictionaries stored under
		non-numeric keys are replaced by ApiResponse objects).
		"""
		numbered = {}
		for key in dictionary:
			value = dictionary[key]
			try:
				int(key)
			except ValueError:
				# Regular key: wrap nested dictionaries in place, leaving
				# dunder-like keys untouched.
				if not key.startswith('__') and isinstance(value, dict):
					dictionary[key] = ApiResponse(value)
			else:
				# Numeric key: keep it apart so that values() can list it.
				if isinstance(value, dict):
					numbered[key] = ApiResponse(value)
				else:
					numbered[key] = value
		self.__dict__ = dictionary
		self.__entries = numbered

	def __getattr__(self, attribute):
		"""Called only for attributes that are missing: they read as None."""
		return None

	def __repr__(self):
		"""Show the underlying dictionary."""
		return repr(self.__dict__)

	def values(self):
		"""Return the items that were stored under numeric keys."""
		return self.__entries.values()
126
127
class Arkbot(object):
	"""Prototype MediaWiki bot talking to a wiki over HTTP and its web API.

	The public methods are login/logout, info, read, edit, replace,
	consolidate, interwikis and search; everything else is private.
	"""

	def __init__(self, name, site, logger):
		"""Store the bot identity and prepare the HTTP opener.

		name   -- user name sent by login()
		site   -- host name pattern holding one %s for the language code
		logger -- logger receiving the debug and info messages
		"""
		self.__logger = logger
		self.__name = name
		self.__userId = None
		self.__token = None
		self.__session = None
		self.__connect(site)

	def __connect(self, site):
		"""Remember the site and install a cookie-aware urllib2 opener."""
		self.__site = site
		self.__opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
		urllib2.install_opener(self.__opener)

	def __request(self, url, data=None, headers=_getHeaders, lang=_lang):
		"""Open ``url`` on the wiki for ``lang`` and return a readable object.

		A POST is issued when ``data`` is given, a GET otherwise. A
		gzip-encoded body is read entirely and returned wrapped in a
		GzipFile. Raises HttpException on an HTTP error status.
		"""
		fullUrl = 'http://' + (self.__site % lang) + url
		try:
			self.__logger.debug('Requesting ' + fullUrl)
			response = self.__opener.open(urllib2.Request(fullUrl, data, headers))
			for header, value in response.headers.items():
				if header.lower() == 'content-encoding' and value == 'gzip':
					# The whole body is buffered, so the connection can be
					# released right away.
					body = response.read()
					response.close()
					response = gzip.GzipFile(fileobj=StringIO.StringIO(body))
					break
			return response # TODO response.close() somewhere?
		except urllib2.HTTPError as e:
			raise HttpException('%s: (on query "%s")' % (e.code, url))

	def __handleApiResponse(self, response, query, noReturn=False):
		"""Decode a JSON API response and turn API errors into exceptions.

		Returns an ApiResponse, or None for an empty response when
		``noReturn`` is set. Raises PageNotFoundException for an empty
		response otherwise, and ApiException when the API reports an error.
		"""
		jsonResponse = json.load(response)
		if not jsonResponse:
			if noReturn:
				return None
			raise PageNotFoundException('Page not found (on query "%s")' % query)
		apiResponse = ApiResponse(jsonResponse)
		if apiResponse.error:
			raise ApiException('%s: %s (on query "%s")' % (apiResponse.error.code, apiResponse.error.info, query))
		return apiResponse

	def __internalLinks(self, text):
		"""Return the targets of every [[...]] link of ``text``, brackets stripped."""
		return [link[2:-2] for link in _internalLink.findall(text)]

	def __categories(self, links, lang):
		"""Return the links that start with the category namespace of ``lang``."""
		prefix = _categoryNamespace[lang] + ':'
		return [link for link in links if link.startswith(prefix)]

	def __interWikis(self, links):
		"""Return the links that look like interwikis.

		A link qualifies when its first colon comes after a prefix of one to
		three characters, or when it starts with 'simple:' or 'tokipona:'.
		"""
		return [link for link in links
			if 0 < link.find(':') < 4 or link.startswith('simple:') or link.startswith('tokipona:')]

	def __search(self, query, *args, **kwargs):
		"""Run a search on the wiki and return the result titles (UTF-8 encoded).

		Keyword arguments are appended to the URL as extra parameters.
		"""
		query = query.replace(' ', '+')
		for arg in kwargs.items():
			query += '&%s=%s' % arg
		document = xml.dom.minidom.parse(self.__request(_searchUrl + query))
		try:
			results = []
			for unorderedList in document.getElementsByTagName('ul'):
				if unorderedList.getAttribute('class').find('mw-search-results') != -1:
					for result in unorderedList.getElementsByTagName('li'):
						results.append(result.firstChild.firstChild.nodeValue.encode('utf8'))
					# Only the first list of results is used.
					break
			return results
		finally:
			# Free the DOM even when the page does not have the expected shape.
			document.unlink()

	def __fetch(self, page, lang=_lang):
		"""Return the raw wikitext of ``page`` on the wiki for ``lang``."""
		return self.__request(_rawUrl + 'action=raw&title=' + page.replace(' ', '_'), lang=lang).read()

	def __get(self, action='query', *args, **kwargs):
		"""Send a GET request to the API and return the ``query`` part of the answer.

		Keyword arguments with a false value are left out of the request.
		"""
		query = 'action=%s&' % action
		for arg in kwargs.items():
			if arg[1]:
				query += '%s=%s&' % arg
		query += 'format=json'
		return self.__handleApiResponse(self.__request(_apiUrl + query.replace(' ', '_')), query).query

	def __post(self, noReturn=False, *args, **kwargs):
		"""Send a POST request to the API and return the decoded answer.

		Keyword arguments with a false value are left out of the request;
		``noReturn`` allows the answer to be empty.
		"""
		kwargs['format'] = 'json'
		parameters = {}
		for parameter, value in kwargs.items():
			if value:
				parameters[parameter] = value
		query = urllib.urlencode(parameters)
		return self.__handleApiResponse(self.__request(_apiUrl, query, _postHeaders), query, noReturn=noReturn)

	def __shouldStop(self):
		"""Tell whether the bot must refrain from editing.

		True when the API sees the bot as anonymous (not logged in anymore)
		or reports pending talk page messages, which is how the bot is asked
		to stop.
		"""
		userinfo = self.__get(meta='userinfo', uiprop='hasmsg').userinfo
		if userinfo.anon is not None:
			return True
		# uiprop=hasmsg adds a 'messages' entry when messages are pending.
		return userinfo.messages is not None

	def __confirm(self, page, oldText, newText, summary):
		"""Show both versions in the diff tool and ask the user to confirm.

		Returns True on 'y' and False on 'n'; any other answer asks again.
		The temporary files are removed once the user has answered.
		"""
		def dumpVersion(text):
			with tempfile.NamedTemporaryFile(delete=False) as dump:
				dump.write(text)
				return dump.name
		old = dumpVersion(oldText)
		try:
			new = dumpVersion(newText)
			try:
				subprocess.call([_diff, old, new])
				while True:
					answer = raw_input('Edit "%s" with summary "%s"? ' % (page, summary))
					if answer == 'y':
						return True
					elif answer == 'n':
						return False
			finally:
				os.unlink(new)
		finally:
			os.unlink(old)

	def __clean(self, text):
		"""Normalise wikitext before saving; currently returns it unchanged."""
		# TODO
		# http://wikipedia/art => art
		# [[foo_bar]] => [[foo bar]]
		# == foo === => == Foo ==
		# remove <ref>wikipedia</ref>
		# remove [[self-link]] or [[self-redirect]]
		# add <references /> when <ref> is used
		# {{ISBN}}
		# {{formatnum:}}
		# {{siècle}}
		# {{date}}
		# [[Image: // [[Fichier: ...
		# see fixes.py and cosmetic_change.py from pywikipedia
		return text

	def login(self, password):
		"""Log in with the bot name and ``password``.

		Raises ApiException when the answer has no login part, or when the
		user name or the password is rejected, or when logins are throttled.
		"""
		apiResponse = self.__post(action='login', lgname=self.__name, lgpassword=password)
		if not apiResponse.login:
			raise ApiException('%s (on login)' % apiResponse)
		result = apiResponse.login.result
		if result in ['NoName', 'NotExists', 'Illegal']:
			raise ApiException('Bad user name (on login)')
		if result in ['EmptyPass', 'WrongPass']:
			raise ApiException('Bad password (on login)')
		if result == 'Throttled':
			raise ApiException('Too many login attempts, please wait for %s seconds and retry (on login)' % apiResponse.login.wait)

	def logout(self):
		"""Log out; an empty answer from the API is accepted."""
		self.__post(action='logout', noReturn=True)

	def info(self, *pages):
		"""Return what the API ``query`` action reports about the given titles."""
		return self.__get(titles='|'.join(pages))

	def read(self, page):
		"""Return the raw wikitext of ``page`` on the default wiki."""
		return self.__fetch(page)

	def edit(self, page, summary, text, minor=False, bot=False, oldText=None):
		"""Save ``text`` as the new content of ``page``.

		The summary is prefixed with 'bot : '. When ``oldText`` is given the
		user is asked to confirm first and nothing is saved if they decline.
		Raises ApiException when the bot must stop or the edit fails.
		"""
		text = self.__clean(text)
		summary = 'bot : ' + summary
		self.__logger.info('Editing page "%s" with summary "%s"' % (page, summary))
		if oldText and not self.__confirm(page, oldText, text, summary):
			self.__logger.info('  => edition not confirmed by user')
			return
		if self.__shouldStop():
			raise ApiException('The bot has been asked to stop editing the wiki, or is not logged in anymore')
		pageInfo = self.__get(titles=page, prop='info|revisions', intoken='edit').pages.values()[0]
		apiResponse = self.__post(action='edit', title=page, summary=summary, text=text, token=pageInfo.edittoken, basetimestamp=pageInfo.starttimestamp, minor=minor, bot=bot)
		if apiResponse.edit.result != 'Success':
			raise ApiException('Unable to edit page "%s" with summary "%s" and text "%s"' % (page, summary, text))

	def replace(self, page, pattern, replacement, summary=None, reason='', minor=False, bot=False, confirm=True):
		"""Apply a regular expression substitution to ``page`` and save the result.

		A summary is generated when none is given, optionally followed by
		``reason``. Nothing is saved when the substitution changes nothing.
		With ``confirm`` the user is shown the diff and asked before saving.
		"""
		if not summary:
			summary = 'Remplacement de "%s" par "%s"' % (pattern, replacement)
			if reason:
				summary += ' (%s)' % reason
		self.__logger.info('Replacing "%s" with "%s" with summary "%s" on page %s"' % (pattern, replacement, summary, page))
		oldText = self.read(page)
		newText = re.sub(pattern, replacement, oldText)
		if newText == oldText:
			self.__logger.info('  => no change after replacement')
			return
		if not confirm:
			# edit() only asks for confirmation when it gets the old text.
			oldText = None
		self.edit(page, summary, newText, minor, bot, oldText)

	def consolidate(self, page):
		"""Collect the interwikis of ``page``, following them transitively.

		Every linked version is fetched once; the sorted list of interwikis
		found is printed. The remaining steps are still to be written.
		"""
		text = {
			_lang: self.__fetch(page)
		}
		links = self.__internalLinks(text[_lang])

		pending = self.__interWikis(links)
		newInterWikis = pending[:]

		# Consolidate interWikis, and fetch all texts
		while pending:
			discovered = []
			for interWiki in pending:
				cut = interWiki.find(':')
				lang = interWiki[:cut]
				text[lang] = self.__fetch(interWiki[cut + 1:], lang)
				otherLinks = self.__internalLinks(text[lang])
				for otherInterWiki in self.__interWikis(otherLinks):
					if otherInterWiki not in newInterWikis and not otherInterWiki.startswith(_lang + ':'):
						newInterWikis.append(otherInterWiki)
						discovered.append(otherInterWiki)
			# Only the links found during this pass still have to be visited;
			# revisiting older ones would make the loop run forever.
			pending = discovered
		newInterWikis.sort()

		# Extract infoboxes
		# TODO iter on all texts, extract infoboxes if available, extract informations from the infoboxes

		# Extract categories
		# TODO iter on all texts, extract categories if available

		# Extract portals if available
		# TODO iter on all texts, extract portals if available

		# Extract introduction paragraph if available
		# TODO iter on all texts, extract introduction paragraph, extract informations from the paragraph

		# Infer stub if applicable
		# TODO if the text is very short, add a stub banner with the correct theme (from the portals, then categories)

		# Insert informations
		# TODO rewrite the introduction paragraph
		# TODO add the infoboxes with the right values
		# TODO add the portals
		# TODO add the categories
		# TODO add the interwikis

		print(newInterWikis)

	def interwikis(self, page):
		"""Return the interwiki links found in the wikitext of ``page``."""
		return self.__interWikis(self.__internalLinks(self.__fetch(page)))

	def search(self, query, *args, **kwargs):
		"""Log the search and return the titles of the pages matching ``query``."""
		self.__logger.info('Searching for "%s" with parameters %s' % (query, kwargs))
		return self.__search(query, *args, **kwargs)
349
350
def main():
	"""Command line entry point: set up logging, parse the options, run the tasks.

	Options: ``-login`` asks for the bot password and logs in before the
	tasks (and logs out afterwards); ``-debug`` also logs at DEBUG level,
	into an additional file. Any other option prints an error and exits
	with status 1. Errors raised by the bot are logged, not propagated.
	"""
	print('Arkbot %s (prototype)' % _version)
	print('(C) 2009-2010 Arkanosis')
	print('arkanosis@gmail.com')
	print('')

	login = False
	logger = logging.getLogger('ArkbotLogger')
	formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', '%H:%M:%S')
	logger.setLevel(logging.INFO)

	def addHandler(filename, level):
		# One log file per level, rotated every night.
		handler = logging.handlers.TimedRotatingFileHandler('arkbot-%s.log' % filename, when='midnight', backupCount=100)
		handler.setFormatter(formatter)
		handler.setLevel(level)
		logger.addHandler(handler)

	addHandler('info', logging.INFO)
	addHandler('errors', logging.WARNING)

	# Messages are echoed on the console as well.
	handler = logging.StreamHandler()
	handler.setFormatter(formatter)
	handler.setLevel(logging.INFO)
	logger.addHandler(handler)

	for arg in sys.argv[1:]:
		if arg == '-login':
			login = True
		elif arg == '-debug':
			logger.setLevel(logging.DEBUG)
			addHandler('debug', logging.DEBUG)
		else:
			print('Error: unknown option "%s"' % arg)
			sys.exit(1)

	bot = Arkbot(_userName, _wiki, logger)
	logger.info('Starting')
	try:
		if login:
			logger.info('Logging in with user name %s' % _userName)
			bot.login(getpass.getpass('Bot password ? '))

		try:
			# TASKS
			#print bot.info('Compression de données', 'Pondération de contextes')
			#print bot.read('en:Data compression')
			#for result in bot.search('"charmant village"'):
			#	print result
			#bot.replace('Utilisateur:Arkbot/test', r'((^|\s)[t|T]ext)(\s|$)', r'\1e\3')
			#bot.replace('Utilisateur:Arkbot/test', r'(^|\W)[cC]harmant(\s+)village(\W|$)', r'\1village\3', reason='non neutre')

			for result in bot.search('"charmant village"'):
				bot.replace(result, r'(^|\W)[cC]harmant(\s+)village(\W|$)', r'\1village\3', reason='non neutre')
			#bot.consolidate('Buddy Rogers (catcheur)')
		finally:
			# Log out even when a task fails, but only after a successful login.
			if login:
				logger.info('Logging out')
				bot.logout()
	except ArkbotException as e:
		logger.error('%s' % e)
	logger.info('Finishing')

	logging.shutdown()


# Run the bot only when the file is executed as a script, not when imported.
if __name__ == '__main__':
	main()

Updated README Download diff

569d131db81c90b9771c0148981c10cf7ec83b048a084e970bdeeb8014b2211ea1eee11a34faa9ff
1
Arkbot v0.0
1
Arkbot v0.0
2
(C) 2009 Arkanosis
2
(C) 2009-2010 Arkanosis
3
arkanosis@gmail.com
3
arkanosis@gmail.com
4
4
5
http://trac-git.assembla.com/arkbot/
5
http://fr.wikipedia.org/wiki/Utilisateur:Arkbot
6
http://fr.wikipedia.org/wiki/Utilisateur:Arkbot
6
http://fr.wikipedia.org/wiki/Utilisateur:Arkanosis
7
http://fr.wikipedia.org/wiki/Utilisateur:Arkanosis