Difference between revisions of "User:WindBOT/CoreSource"
m (Added new built-in filter: Simplify anchors)
(52 intermediate revisions by the same user not shown)
Line 1:
− <pre>#!/usr/bin/python -OO
+ <pre><nowiki>#!/usr/bin/python -OO
# -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||
# This program is free software: you can redistribute it and/or modify | # This program is free software: you can redistribute it and/or modify | ||
Line 15: | Line 15: | ||
import sys | import sys | ||
+ | import os | ||
+ | import time, datetime | ||
import re | import re | ||
+ | import hashlib | ||
import urllib2 | import urllib2 | ||
+ | import tempfile | ||
+ | import traceback | ||
+ | import threading | ||
import random | import random | ||
import subprocess | import subprocess | ||
+ | import cStringIO as StringIO | ||
+ | import shutil | ||
import wikitools | import wikitools | ||
+ | import wikiUpload | ||
+ | import feedparser | ||
+ | try: | ||
+ | import steam | ||
+ | except: | ||
+ | steam = None | ||
+ | from wikiUpload import wikiUploader | ||
from botConfig import config | from botConfig import config | ||
+ | if steam is not None and 'steamAPI' in config: | ||
+ | steam.set_api_key(config['steamAPI']) | ||
config['runtime'] = { | config['runtime'] = { | ||
'rcid': -1, | 'rcid': -1, | ||
Line 27: | Line 44: | ||
'wiki': None, | 'wiki': None, | ||
'edits': 0, | 'edits': 0, | ||
+ 'regexes': {},
+ 'pages': {},
+ 'uploader': wikiUploader(config['username'], config['password'], config['api'])
} | } | ||
Line 58: | Line 76: | ||
self.kwargs = kwargs | self.kwargs = kwargs | ||
def __str__(self): | def __str__(self): | ||
− return u'<
+ if u(self.func) == u(regSub):
+ return 'Regex' + u(self.pending)
+ return u'<Curry of ' + u(self.func) + u'; args = ' + u(self.pending) + u'; kwargs = ' + u(self.kwargs) + u'>'
def __repr__(self): | def __repr__(self): | ||
return self.__str__() | return self.__str__() | ||
Line 68: | Line 88: | ||
kw = kwargs or self.kwargs | kw = kwargs or self.kwargs | ||
return self.func(*(self.pending + args), **kw) | return self.func(*(self.pending + args), **kw) | ||
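# Illustrative usage of curry (not part of the original source): it pre-binds
# leading positional arguments, much like functools.partial.
def _exampleCurryUsage():
    clampPositive = curry(max, 0)   # hypothetical helper name
    return clampPositive(-3)        # -> 0, i.e. max(0, -3)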
+ | class BatchScheduler: | ||
+ | def __init__(self, concurrency=16): | ||
+ self.concurrency = concurrency
+ | self.tasks = [] | ||
+ | def schedule(self, target, *args, **kwargs): | ||
+ | self.tasks.append((target, args, kwargs)) | ||
+ | def execute(self): | ||
+ | while len(self.tasks): | ||
+ | pool = [] | ||
+ | numThreads = min(self.concurrency, len(self.tasks)) | ||
+ | for task in range(numThreads): | ||
+ | task = self.tasks[task] | ||
+ | t = threading.Thread(target=task[0], args=task[1], kwargs=task[2]) | ||
+ | t.start() | ||
+ | pool.append(t) | ||
+ | for t in pool: | ||
+ | t.join() | ||
+ | self.tasks = self.tasks[numThreads:] | ||
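# Illustrative usage of BatchScheduler (not part of the original source): queue
# several page fixes and run them in batches of worker threads.
def _exampleBatchScheduler():
    scheduler = BatchScheduler(concurrency=4)
    for title in (u'Scout', u'Soldier', u'Pyro'):   # hypothetical page titles
        scheduler.schedule(fixPage, title)
    scheduler.execute()   # blocks until every scheduled task has run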
+ | def getTempFilename(extension=None): | ||
+ | global config | ||
+ | if extension is None: | ||
+ | f = tempfile.mkstemp(prefix=config['tempPrefix']) | ||
+ | else: | ||
+ | f = tempfile.mkstemp(suffix=u'.' + u(extension), prefix=config['tempPrefix']) | ||
+ | os.close(f[0]) # Damn you Python I just want a filename | ||
+ | return u(f[1]) | ||
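# Illustrative usage of getTempFilename (not part of the original source): the
# helper only hands back a path; the caller writes the file and cleans it up.
def _exampleTempFile():
    tmp = getTempFilename('png')            # e.g. <tempPrefix>XXXXXX.png
    open(tmp, 'wb').write('scratch data')   # hypothetical payload
    deleteFile(tmp)                         # deleteFile() is defined further down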
def wiki(): | def wiki(): | ||
Line 83: | Line 129: | ||
return config['runtime']['wiki'] | return config['runtime']['wiki'] | ||
def page(p): | def page(p): | ||
+ | global config | ||
if type(p) in (type(''), type(u'')): | if type(p) in (type(''), type(u'')): | ||
− p = wikitools.page.Page(wiki(),
+ p = u(p)
+ if p not in config['runtime']['pages']:
+ config['runtime']['pages'][p] = wikitools.page.Page(wiki(), p, followRedir=False)
+ return config['runtime']['pages'][p]
+ # Else, it is a page object
+ title = u(p.title)
+ if title not in config['runtime']['pages']:
+ config['runtime']['pages'][title] = p
+ return config['runtime']['pages'][title]
+ | def getSummary(summary): | ||
summary = u(summary) | summary = u(summary) | ||
while len(summary) > 250: | while len(summary) > 250: | ||
Line 94: | Line 147: | ||
else: | else: | ||
summary = summary[:247] + u'...' | summary = summary[:247] + u'...' | ||
+ return summary
+ | def editPage(p, content, summary=u'', minor=True, bot=True, nocreate=True): | ||
+ | global config | ||
+ | summary = getSummary(summary) | ||
+ | p = page(p) | ||
+ | try: | ||
+ | print 'Editing', p.title, 'with summary', summary | ||
+ | except: | ||
+ | pass | ||
+ | try: | ||
+ | if nocreate: | ||
+ | if minor: | ||
+ | result = p.edit(u(content), summary=summary, minor=True, bot=bot, nocreate=nocreate) | ||
+ | else: | ||
+ | result = p.edit(u(content), summary=summary, notminor=True, bot=bot, nocreate=nocreate) | ||
+ | else: | ||
+ | if minor: | ||
+ | result = p.edit(u(content), summary=summary, minor=True, bot=bot) | ||
+ | else: | ||
+ | result = p.edit(u(content), summary=summary, notminor=True, bot=bot) | ||
+ | except: | ||
+ | warning('Couldn\'t edit', p.title) | ||
+ | return None | ||
try: | try: | ||
if result['edit']['result']: | if result['edit']['result']: | ||
config['runtime']['edits'] += 1 | config['runtime']['edits'] += 1 | ||
except: | except: | ||
− | warning('Couldn\'t edit', p) | + | warning('Couldn\'t edit', p.title) |
return result | return result | ||
+ | def deletePage(p, summary=False): | ||
+ | if summary: | ||
+ | summary = getSummary(summary) | ||
+ | return page(p).delete(summary) | ||
+ | def uploadFile(filename, destfile, pagecontent='', license='', overwrite=False, reupload=False): | ||
+ | global config | ||
+ | return config['runtime']['uploader'].upload(filename, destfile, pagecontent, license, overwrite=overwrite, reupload=reupload) | ||
def updateRCID(): | def updateRCID(): | ||
if abs(config['runtime']['rcid'] - config['runtime']['onlinercid']) >= config['rcidrate']: | if abs(config['runtime']['rcid'] - config['runtime']['onlinercid']) >= config['rcidrate']: | ||
Line 125: | Line 207: | ||
except: | except: | ||
warning('Couldn\'t update edit count.') | warning('Couldn\'t update edit count.') | ||
+ | |||
+ | # Because SOME LIBRARY will not use singletons, this has to be done at the bot level | ||
+ | # rather than the individual filter level to avoid loading the damn thing twice. | ||
+ | steamGameSchemas = {} | ||
+ | def steamGetGameSchema(game): | ||
+ | global steamGameSchemas | ||
+ | if steam is None: | ||
+ | return None | ||
+ | if game not in steamGameSchemas: | ||
+ | steamGameSchemas[game] = game.item_schema() | ||
+ | return steamGameSchemas[game] | ||
+ | steamGameAssets = {} | ||
+ | def steamGetGameAssets(game): | ||
+ | global steamGameAssets | ||
+ | if steam is None: | ||
+ | return None | ||
+ | if game not in steamGameAssets: | ||
+ | steamGameAssets[game] = game.assets() | ||
+ | return steamGameAssets[game] | ||
def compileRegex(regex, flags=re.IGNORECASE): | def compileRegex(regex, flags=re.IGNORECASE): | ||
Line 143: | Line 244: | ||
sys.exit(1) | sys.exit(1) | ||
+ | def setFilterName(f, name): | ||
+ | name = u(name) | ||
+ | f.__unicode__ = lambda: name | ||
+ | f.__str__ = lambda: name.encode('utf8') | ||
+ | f.filterName = name | ||
+ | return f | ||
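# Illustrative usage of setFilterName (not part of the original source): naming
# a filter lets filterRepr() build a readable edit summary from it later.
def _exampleNamedFilter():
    f = setFilterName(curry(regSub, {u'colou?r': u'color'}), u'AmericanSpelling')
    return f.filterName   # -> u'AmericanSpelling'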
class link: | class link: | ||
def __init__(self, content): | def __init__(self, content): | ||
content = u(content) | content = u(content) | ||
+ | self.joined = False | ||
self.setBody(content) | self.setBody(content) | ||
self.setType(u'unknown') | self.setType(u'unknown') | ||
+ | self.setLabel(None) | ||
+ | self.setLink(u'') | ||
+ | self.anchor = None | ||
self.joined = False | self.joined = False | ||
if len(content) > 2: | if len(content) > 2: | ||
Line 157: | Line 268: | ||
if lnk.find(u':') == -1: | if lnk.find(u':') == -1: | ||
lnk = lnk.replace(u'_', u' ') | lnk = lnk.replace(u'_', u' ') | ||
+ | anchor = None | ||
+ | if lnk.find(u'#') != -1: | ||
+ | lnk, anchor = lnk.split(u'#', 1) | ||
+ | self.setAnchor(anchor) | ||
self.setLink(lnk) | self.setLink(lnk) | ||
if len(split) == 2: | if len(split) == 2: | ||
Line 162: | Line 277: | ||
else: | else: | ||
self.setLabel(split[0]) | self.setLabel(split[0]) | ||
− self.joined =
+ self.joined = anchor is None
elif content[0] == u'[' and content[-1] == u']': | elif content[0] == u'[' and content[-1] == u']': | ||
split = content[1:-1].split(u' ', 1) | split = content[1:-1].split(u' ', 1) | ||
Line 175: | Line 290: | ||
def getBody(self): | def getBody(self): | ||
return u(self.body) | return u(self.body) | ||
− def getLink(self):
+ def getLink(self, withAnchor=False):
+ if withAnchor and self.getAnchor() is not None:
+ return u(self.link) + u'#' + self.getAnchor()
return u(self.link)
+ | def getAnchor(self): | ||
+ | return self.anchor | ||
def getLabel(self): | def getLabel(self): | ||
if self.label is None: | if self.label is None: | ||
Line 188: | Line 307: | ||
self.body = u(body) | self.body = u(body) | ||
def setLink(self, link): | def setLink(self, link): | ||
− self.link = u(link
+ link = u(link)
+ if self.getType() == u'internal' and link.find(u'#') != -1:
+ link, anchor = link.split(u'#', 1)
+ self.setAnchor(anchor)
+ self.link = link
if self.joined: | if self.joined: | ||
self.label = u(link) | self.label = u(link) | ||
+ | replaceDots = compileRegex(r'(?:\.[a-f\d][a-f\d])+') | ||
+ | def _replaceDots(self, g): | ||
+ | s = '' | ||
+ | g = g.group(0) | ||
+ | for i in xrange(0, len(g), 3): | ||
+ | s += chr(int(g[i + 1:i + 3], 16)) | ||
+ | return s.decode('utf8') | ||
+ | def setAnchor(self, anchor): | ||
+ | if self.getType() == u'internal': | ||
+ anchor = u(anchor).replace(u'_', u' ')
+ | try: | ||
+ | anchor = link.replaceDots.sub(self._replaceDots, anchor) | ||
+ | except: | ||
+ | pass | ||
+ | self.anchor = anchor | ||
def setLabel(self, label): | def setLabel(self, label): | ||
if label is None: | if label is None: | ||
Line 201: | Line 339: | ||
return self.__unicode__() | return self.__unicode__() | ||
def __repr__(self): | def __repr__(self): | ||
− | return u'<Link-' + self.getType() + ': ' + self.__unicode__() + u'>' | + | return u'<Link-' + self.getType() + u': ' + self.__unicode__() + u'>' |
def __unicode__(self): | def __unicode__(self): | ||
label = self.getLabel() | label = self.getLabel() | ||
− | tmpLink = self.getLink() | + | tmpLink = self.getLink(withAnchor=True) |
if self.getType() == u'internal': | if self.getType() == u'internal': | ||
tmpLink2 = tmpLink.replace(u'_', u' ') | tmpLink2 = tmpLink.replace(u'_', u' ') | ||
− if label in (tmpLink2, tmpLink):
+ if label in (tmpLink2, tmpLink) or (label and tmpLink and (label[0].lower() == tmpLink[0].lower() and tmpLink[1:] == label[1:]) or (label[0].lower() == tmpLink2[0].lower() and tmpLink2[1:] == label[1:])):
return u'[[' + label + u']]'
− elif tmpLink and label and len(label) > len(tmpLink) and (label.lower().find(tmpLink2.lower())
+ elif tmpLink and label and len(label) > len(tmpLink) and (label.lower().find(tmpLink2.lower()) == 0 or label.lower().find(tmpLink.lower()) == 0):
index = max(label.lower().find(tmpLink2.lower()), label.lower().find(tmpLink.lower()))
badchars = (u' ', u'_') | badchars = (u' ', u'_') | ||
Line 226: | Line 362: | ||
return u'[' + tmpLink + u' ' + label + u']' | return u'[' + tmpLink + u' ' + label + u']' | ||
return self.getBody() | return self.getBody() | ||
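# Illustrative sketch of the link class (not part of the original source): it
# parses a wiki link, exposes its parts, and re-serialises the shortest form.
def _exampleLink():
    l = link(u'[[Scattergun|Scattergun]]')   # hypothetical article name
    return u(l)   # -> u'[[Scattergun]]', the redundant label is collapsed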
+ | class template: | ||
+ | maxInlineParams = 1 | ||
+ | def __init__(self, content): | ||
+ | content = u(content) | ||
+ | self.changed = False | ||
+ | self.content = content | ||
+ | self.name = None | ||
+ | self.order = None | ||
+ | self.links = [] | ||
+ | self.params = [] | ||
+ | self.paramNum = 0 | ||
+ | self.indentation = {} | ||
+ | self.defaultIndent = 0 | ||
+ | if len(content) > 4 and content[:2] == '{{' and content[-2:] == '}}': | ||
+ | innerRegex = compileRegex(r'\s*\|\s*') | ||
+ | itemRegex = compileRegex(r'^(\S[^=]*?)\s*=\s*([\s\S]*?)$') | ||
+ | content = content[2:-2] | ||
+ | content, self.links = linkExtract(content) | ||
+ | innerStuff = innerRegex.split(content) | ||
+ | if innerStuff[0][:9].lower() == 'template:': | ||
+ | innerStuff[0] = innerStuff[0][9:] | ||
+ | self.name = u(innerStuff[0][0].upper() + innerStuff[0][1:]).replace(u'_', u' ').strip() | ||
+ | innerStuff = innerStuff[1:] | ||
+ | for i in innerStuff: | ||
+ | i = linkRestore(i.strip(), self.links, restore=True) | ||
+ | itemRes = itemRegex.search(i) | ||
+ | if itemRes: | ||
+ | self.params.append((u(itemRes.group(1)).lower(), u(itemRes.group(2)))) | ||
+ | else: | ||
+ | self.paramNum += 1 | ||
+ | self.params.append((u(self.paramNum), i)) | ||
+ | self.originalContent = self.content | ||
+ | self.originalParams = self.params[:] | ||
+ | self.originalName = self.name | ||
+ | self.forceindent = False | ||
+ | def indentationMatters(self, doesitmatter=True): | ||
+ | self.forceindent = doesitmatter | ||
+ | def getName(self): | ||
+ | return self.name | ||
+ | def setName(self, name): | ||
+ | self.name = u(name).replace(u'_', u' ') | ||
+ | self.changed = self.changed or self.name != self.originalName | ||
+ | def getParam(self, key): | ||
+ | key = u(key).lower() | ||
+ | for k, v in self.params: | ||
+ | if k == key: | ||
+ | return v | ||
+ | return None | ||
+ | def delParam(self, *indexes): | ||
+ | for index in indexes: | ||
+ | index = u(index) | ||
+ | isNumber = self.isInt(index) | ||
+ | for p in range(len(self.params)): | ||
+ | k, v = self.params[p] | ||
+ | if k == index: | ||
+ | self.changed = True | ||
+ | self.params = self.params[:p] + self.params[p+1:] | ||
+ | if isNumber: | ||
+ | if int(index) == self.paramNum: | ||
+ | self.paramNum -= 1 | ||
+ | break | ||
+ | def setParam(self, index=None, value=u''): | ||
+ | if value is None: | ||
+ | return self.delParam(index) | ||
+ | if index is None: | ||
+ | index = u(self.paramNum) | ||
+ | else: | ||
+ | index = u(index).lower() | ||
+ | isNumber = self.isInt(index) | ||
+ | value = u(value) | ||
+ | hasChanged = False | ||
+ | for p in range(len(self.params)): | ||
+ | k, v = self.params[p] | ||
+ | if k == index: | ||
+ | self.changed = self.changed or v != value | ||
+ | self.params[p] = (k, value) | ||
+ | hasChanged = True | ||
+ | break | ||
+ | if not hasChanged: | ||
+ | if isNumber: | ||
+ while self.paramNum < int(index) - 1:
+ | self.appendParam(u'') | ||
+ | self.params.append((index, value)) | ||
+ | self.changed = True | ||
+ | def appendParam(self, value=u''): | ||
+ | self.paramNum += 1 | ||
+ | self.params.append((u(self.paramNum), value)) | ||
+ | def setPreferedIndentation(self, index, indent=0): | ||
+ | self.indentation[u(index)] = indent | ||
+ | self.changed = self.changed or self.forceindent | ||
+ | def setDefaultIndentation(self, indent=0): | ||
+ | self.defaultIndent = indent | ||
+ | self.changed = self.changed or self.forceindent | ||
+ | def setPreferedOrder(self, order=None): | ||
+ | order2 = [] | ||
+ | for o in order: | ||
+ | order2.append(u(o)) | ||
+ | self.order = order2 | ||
+ | oldParams = self.params[:] | ||
+ self.changed = self.changed or self.fixOrder() != oldParams
+ | def renameParam(self, oldkey, newkey): | ||
+ | oldkey, newkey = u(oldkey).lower(), u(newkey).lower() | ||
+ | if oldkey == newkey: | ||
+ | return | ||
+ | for p in range(len(self.params)): | ||
+ | k, v = self.params[p] | ||
+ | if k == oldkey: | ||
+ | self.params[p] = (newkey, v) | ||
+ | self.changed = True | ||
+ | break | ||
+ | def fixOrder(self): | ||
+ | if self.order is None: | ||
+ | return self.params | ||
+ | newParams = [] | ||
+ | doneParams = [] | ||
+ | for k in self.order: | ||
+ | k = u(k) | ||
+ | if self.getParam(k) is not None: | ||
+ | newParams.append((k, self.getParam(k))) | ||
+ | doneParams.append(k) | ||
+ | for k, v in self.params: | ||
+ | if k not in doneParams: | ||
+ | doneParams.append(k) | ||
+ | newParams.append((k, v)) | ||
+ | self.params = newParams | ||
+ | return self.params | ||
+ | def defined(self): | ||
+ | return self.name is not None | ||
+ | def isInt(self, i): | ||
+ | try: | ||
+ | return u(int(i)) == u(i) and int(i) > 0 | ||
+ | except: | ||
+ | return False | ||
+ | def __str__(self): | ||
+ | return self.__unicode__() | ||
+ | def __repr__(self): | ||
+ | return u'<Template-' + self.getName() + u': ' + self.__unicode__() + u'>' | ||
+ | def __unicode__(self): | ||
+ | if not self.defined(): | ||
+ | return u'' | ||
+ | if not self.changed: | ||
+ | return self.originalContent | ||
+ | self.fixOrder() | ||
+ | params = [self.name] | ||
+ | indentMode = len(self.params) > template.maxInlineParams or self.forceindent | ||
+ | maxIndent = 0 | ||
+ | if indentMode: | ||
+ | for k, v in self.params: | ||
+ | l = len(k) + self.defaultIndent | ||
+ | if k in self.indentation: | ||
+ | l = len(k) + self.indentation[k] | ||
+ | if not self.isInt(k) and l > maxIndent: | ||
+ | maxIndent = l | ||
+ | numParam = 1 | ||
+ | for k, v in self.params: | ||
+ | indent = self.defaultIndent | ||
+ | if k in self.indentation and indentMode: | ||
+ | indent = self.indentation[k] | ||
+ | try: | ||
+ isNumber = u(int(k)) == u(k) and int(k) > 0
+ | except: | ||
+ | isNumber = False | ||
+ | if indentMode: | ||
+ | key = u' ' * indent + u'| ' | ||
+ | addKey = True | ||
+ | if self.isInt(k): | ||
+ | if int(k) == numParam: | ||
+ | addKey = False | ||
+ | numParam += 1 | ||
+ | if addKey: | ||
+ | key += k + (u' ' * max(0, maxIndent - len(k) - indent)) + u' = ' | ||
+ | else: | ||
+ | key = u'' | ||
+ | addKey = True | ||
+ | if self.isInt(k): | ||
+ | if int(k) == numParam: | ||
+ | addKey = False | ||
+ | numParam += 1 | ||
+ | if addKey: | ||
+ | key += k + u' = ' | ||
+ | params.append(key + v) | ||
+ | if indentMode: | ||
+ | params = u'\n'.join(params) + u'\n' | ||
+ | else: | ||
+ | params = u' | '.join(params) | ||
+ | return u'{{' + params + u'}}' | ||
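# Illustrative sketch of the template class (not part of the original source):
# parse a template call, change a parameter, and re-serialise it.
def _exampleTemplate():
    t = template(u'{{Item infobox | type = weapon}}')   # hypothetical template
    t.setParam(u'type', u'primary weapon')
    return u(t)   # -> u'{{Item infobox | type = primary weapon}}'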
def linkExtract(content): | def linkExtract(content): | ||
+ | content = u(content) | ||
links1 = compileRegex(r'\[\[([^\[\]]+)\]\]') | links1 = compileRegex(r'\[\[([^\[\]]+)\]\]') | ||
links2 = compileRegex(r'\[([^\[\]]+)\](?!\])') | links2 = compileRegex(r'\[([^\[\]]+)\](?!\])') | ||
Line 244: | Line 567: | ||
res = links2.search(content) | res = links2.search(content) | ||
return content, linklist | return content, linklist | ||
− def linkRestore(content,
+ def templateExtract(content):
+ | content = u(content) | ||
+ | templatesR = compileRegex(r'\{\{([^\{\}]+)\}\}') | ||
+ | templatecount = 0 | ||
+ | templatelist = [] | ||
+ | res = templatesR.search(content) | ||
+ | while res: | ||
+ | templatelist.append(template(res.group())) | ||
+ | content = content[:res.start()] + u'~!~!~!~OMGTEMPLATE-' + u(templatecount) + u'~!~!~!~' + content[res.end():] | ||
+ | templatecount += 1 | ||
+ | res = templatesR.search(content) | ||
+ | return content, templatelist | ||
+ | def blankAround(content, search, repl=u''): | ||
+ | content = u(content) | ||
+ | search = u(search) | ||
+ | repl = u(repl) | ||
+ | blank = compileRegex(u'(\\s*)' + u(re.escape(search)) + u'(\\s*)') | ||
+ | res = blank.search(content) | ||
+ | if not res: | ||
+ | return content.replace(search, repl) | ||
+ | if u(res.group(0)) == content: | ||
+ | return repl | ||
+ | if len(res.group(1)) < len(res.group(2)): | ||
+ | return content[:res.end(1)] + content[res.end(2):] | ||
+ | else: | ||
+ | return content[:res.start()] + content[res.start(2):] | ||
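# Illustrative usage of blankAround (not part of the original source): remove a
# placeholder together with the shorter whitespace run around it.
def _exampleBlankAround():
    return blankAround(u'foo  TOKEN bar', u'TOKEN')   # -> u'foo bar'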
+ | def linkRestore(content, links=[], restore=False): | ||
+ | linklist=links[:] | ||
linkcount = len(linklist) | linkcount = len(linklist) | ||
i = 0 | i = 0 | ||
Line 250: | Line 600: | ||
for l in linklist: | for l in linklist: | ||
i += 1 | i += 1 | ||
− content = content.replace(u'~!~!~!~OMGLINK-' + u(linkcount - i) + u'~!~!~!~', u(l))
+ if l is None:
+ content = blankAround(content, u'~!~!~!~OMGLINK-' + u(linkcount - i) + u'~!~!~!~', u'')
+ else:
+ if restore:
+ l = l.getBody()
+ content = content.replace(u'~!~!~!~OMGLINK-' + u(linkcount - i) + u'~!~!~!~', u(l))
+ | return content | ||
+ | def templateRestore(content, templatelist=[]): | ||
+ | templatecount = len(templatelist) | ||
+ | i = 0 | ||
+ | templatelist.reverse() | ||
+ | for t in templatelist: | ||
+ | i += 1 | ||
+ | if t is None: | ||
+ | content = blankAround(content, u'~!~!~!~OMGTEMPLATE-' + u(templatecount - i) + u'~!~!~!~', u'') | ||
+ | else: | ||
+ | content = content.replace(u'~!~!~!~OMGTEMPLATE-' + u(templatecount - i) + u'~!~!~!~', u(t)) | ||
return content | return content | ||
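# Illustrative round trip (not part of the original source): templates are
# swapped out for placeholders while text filters run, then swapped back in.
def _exampleTemplateRoundTrip():
    text, tpls = templateExtract(u'Intro {{Stub}} outro')
    # text is now u'Intro ~!~!~!~OMGTEMPLATE-0~!~!~!~ outro'
    return templateRestore(text, tpls)   # -> u'Intro {{Stub}} outro'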
def safeContent(content): | def safeContent(content): | ||
safelist = []
+ tags = compileRegex(r'<(?:ref|gallery|pre|code)[^<>]*>[\S\s]*?</(?:ref|gallery|pre|code)>', re.IGNORECASE | re.MULTILINE)
+ comments = compileRegex(r'<!--[\S\s]*?-->')
− tags = compileRegex(r'<(?:ref|gallery|pre|code)[^<>]*>[\S\s]*?</(?:ref|gallery|pre|code)>
tagcount = 0
res = tags.search(content)
+ if not res:
+ res = comments.search(content)
while res: | while res: | ||
safelist.append(('~!~!~!~OMGTAG-' + u(tagcount) + u'~!~!~!~', u(res.group()))) | safelist.append(('~!~!~!~OMGTAG-' + u(tagcount) + u'~!~!~!~', u(res.group()))) | ||
Line 271: | Line 631: | ||
tagcount += 1 | tagcount += 1 | ||
res = tags.search(content) | res = tags.search(content) | ||
− return content
− def safeContentRestore(content
+ if not res:
+ res = comments.search(content)
+ return content, safelist
+ def safeContentRestore(content, safelist=[]):
safelist.reverse() | safelist.reverse() | ||
for s in safelist: | for s in safelist: | ||
content = content.replace(s[0], s[1]) | content = content.replace(s[0], s[1]) | ||
− | |||
return content | return content | ||
− | |||
def regReplaceCallBack(sub, match): | def regReplaceCallBack(sub, match): | ||
groupcount = 1 | groupcount = 1 | ||
Line 291: | Line 651: | ||
content = u(content) | content = u(content) | ||
for regex in regexes.keys(): | for regex in regexes.keys(): | ||
− compiled = compileRegex(u(regex), re.IGNORECASE | re.DOTALL | re.MULTILINE)
+ if type(regex) in (type(()), type([])):
+ compiled = compileRegex(u(regex[0]), regex[1])
+ else:
+ compiled = compileRegex(u(regex), re.IGNORECASE | re.DOTALL | re.MULTILINE)
callback = curry(regReplaceCallBack, u(regexes[regex])) | callback = curry(regReplaceCallBack, u(regexes[regex])) | ||
oldcontent = u'' | oldcontent = u'' | ||
Line 303: | Line 666: | ||
content = content.replace(u(s), u(strings[s])) | content = content.replace(u(s), u(strings[s])) | ||
return content | return content | ||
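# Illustrative usage of regSub (not part of the original source): $1, $2, ...
# in the replacement string refer to the regex capture groups.
def _exampleRegexFilter():
    hpFilter = curry(regSub, {r'(\d+) hp': u'$1 health'})   # hypothetical rule
    return hpFilter(u'Has 125 HP')   # -> u'Has 125 health'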
− | def sFilter(filters, content, **kwargs): | + | def filterEnabled(f, **kwargs): |
+ | if type(f) is not type(()): | ||
+ | return True | ||
+ | if len(f) < 2: | ||
+ | return True | ||
+ | if type(f[1]) is not type({}): | ||
+ | return True | ||
+ | article = None | ||
+ | if 'article' in kwargs.keys(): | ||
+ | article = kwargs['article'] | ||
+ | if article is None: | ||
+ | return True | ||
+ | if type(article) not in (type(u''), type('')): | ||
+ | article = article.title | ||
+ | if article is None: | ||
+ | return True | ||
+ | if 'languageBlacklist' in f[1].keys(): | ||
+ | for i in f[1]['languageBlacklist']: | ||
+ | if compileRegex(u'/' + u(i) + u'$').search(u(article)): | ||
+ | return False | ||
+ | return True | ||
+ | if 'languageWhitelist' in f[1].keys(): | ||
+ | for i in f[1]['languageWhitelist']: | ||
+ | if compileRegex(u'/' + u(i) + u'$').search(u(article)): | ||
+ | return True | ||
+ | return False | ||
+ | if 'language' in f[1].keys(): | ||
+ | return compileRegex(u'/' + u(f[1]['language']) + u'$').search(u(article)) | ||
+ | return True | ||
+ | scheduledTasks = [] | ||
+ | def scheduleTask(task, oneinevery): | ||
+ | global scheduledTasks | ||
+ | result = random.randint(0, oneinevery-1) | ||
+ | print 'Task:', task, '; result:', result | ||
+ | if not result: | ||
+ | scheduledTasks.append(task) | ||
+ | def runScheduledTasks(): | ||
+ | global scheduledTasks | ||
+ | if not len(scheduledTasks): | ||
+ | print 'No tasks scheduled.' | ||
+ | return | ||
+ | print 'Running scheduled tasks...' | ||
+ | for t in scheduledTasks: | ||
+ | print 'Running task:', t | ||
+ | try: | ||
+ | t() | ||
+ | print 'End of task:', t | ||
+ | except: | ||
+ | print 'Error while executing task:', t | ||
+ | def sFilter(filters, content, returnActive=False, **kwargs): | ||
content = u(content) | content = u(content) | ||
lenfilters = len(filters) | lenfilters = len(filters) | ||
if not lenfilters: | if not lenfilters: | ||
+ | if returnActive: | ||
+ | return content, [] | ||
return content | return content | ||
− | |||
filtercount = 0 | filtercount = 0 | ||
+ | activeFilters = [] | ||
for f in filters: | for f in filters: | ||
+ | if not filterEnabled(f, **kwargs): | ||
+ | continue | ||
+ | if type(f) is type(()): | ||
+ | f, params = f | ||
filtercount += 1 | filtercount += 1 | ||
#print 'Filter', f, '(', filtercount, '/', lenfilters, ')' | #print 'Filter', f, '(', filtercount, '/', lenfilters, ')' | ||
loopTimes = 0 | loopTimes = 0 | ||
− while not loopTimes or
+ beforeFilter = u''
+ while not loopTimes or beforeFilter != content:
loopTimes += 1
− if loopTimes >=
+ if loopTimes >= config['filterPasses']:
− print 'Warning: More than
+ print 'Warning: More than', config['filterPasses'], 'loops with filter', u(f)
break
+ beforeFilter = content
content = u(f(content, **kwargs)) | content = u(f(content, **kwargs)) | ||
+ | if content != beforeFilter and f not in activeFilters: | ||
+ | activeFilters.append(f) | ||
+ | if returnActive: | ||
+ | return content, activeFilters | ||
return content | return content | ||
− | def linkFilter(filters, linklist, **kwargs): | + | def linkFilter(filters, linklist, returnActive=False, **kwargs): |
+ | activeFilters = [] | ||
for f in filters: | for f in filters: | ||
+ | if not filterEnabled(f, **kwargs): | ||
+ | continue | ||
+ | if type(f) is type(()): | ||
+ | f, params = f | ||
for i in range(len(linklist)): | for i in range(len(linklist)): | ||
− | linklist[i] = f(linklist[i], **kwargs) | + | if linklist[i] is not None and isinstance(linklist[i], link): |
+ | oldLink = u(linklist[i]) | ||
+ | linklist[i] = f(linklist[i], **kwargs) | ||
+ | if oldLink != u(linklist[i]) and f not in activeFilters: | ||
+ | activeFilters.append(f) | ||
+ | if returnActive: | ||
+ | return linklist, activeFilters | ||
return linklist | return linklist | ||
− def
− for i in range(len(
− if l.getType() == u'internal' and l.getLink().find(u':') == -1 and pageFilter(l.getLink()):
− return
+ def templateFilter(filters, templatelist, returnActive=False, **kwargs):
+ activeFilters = []
+ for f in filters:
+ if not filterEnabled(f, **kwargs):
+ continue
+ if type(f) is type(()):
+ f, params = f
+ for i in range(len(templatelist)):
+ if templatelist[i] is not None and isinstance(templatelist[i], template):
+ oldTemplate = u(templatelist[i])
+ templatelist[i] = f(templatelist[i], **kwargs)
+ if oldTemplate != u(templatelist[i]) and f not in activeFilters:
+ activeFilters.append(f)
+ if returnActive:
+ return templatelist, activeFilters
+ return templatelist
+ | def linkTextFilter(subfilters, l, linksafe=False, **kwargs): | ||
+ | if l.getType() == u'internal' and l.getLink().find(u':') == -1 and pageFilter(l.getLink()): | ||
+ | if linksafe: | ||
+ | l.setLink(sFilter(subfilters, l.getLink(), **kwargs)) | ||
+ | if l.getLabel().find(u':') == -1: | ||
+ | l.setLabel(sFilter(subfilters, l.getLabel(), **kwargs)) | ||
+ | return l | ||
+ | def linkDomainSub(fromDomain, toDomain, link, **kwargs): | ||
+ | domainR = compileRegex(r'^(https?://(?:[-\w]+\.)*)' + u(re.escape(fromDomain)) + r'(\S+)$') | ||
+ | toDomain = u(toDomain) | ||
+ | if link.getType() == 'external': | ||
+ | linkInfo = domainR.search(link.getLink()) | ||
+ | if linkInfo: | ||
+ | link.setLink(u(linkInfo.group(1)) + toDomain + u(linkInfo.group(2))) | ||
+ | return link | ||
+ | def linkDomainFilter(fromDomain, toDomain): | ||
+ | return curry(linkDomainSub, fromDomain, toDomain) | ||
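# Illustrative usage of linkDomainFilter (not part of the original source):
# rewrite external links from one domain to another, keeping scheme and path.
def _exampleDomainFilter():
    f = linkDomainFilter('example.org', 'example.com')   # hypothetical domains
    return u(f(link(u'[http://wiki.example.org/page label]')))
    # -> u'[http://wiki.example.com/page label]'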
def regexes(rs): | def regexes(rs): | ||
return curry(regSub, rs) | return curry(regSub, rs) | ||
Line 343: | Line 799: | ||
return regexes({reg: replace}) | return regexes({reg: replace}) | ||
def dumbReplaces(rs): | def dumbReplaces(rs): | ||
− return curry(dumbReplacement, rs)
+ return setFilterName(curry(dumbReplacement, rs), u'DumbReplacements(' + u(rs) + u')')
def dumbReplace(subject, replacement):
− return
+ return setFilterName(dumbReplaces({subject: replacement}), u'DumbReplacement(' + u(subject) + u' \u2192 ' + u(replacement) + u')')
− def wordRegex(word):
+ def wordRegex(word, **kwargs):
− word = u(re.sub(r' +', r'[-_ ]', u(word)))
+ flags = None
+ if type(word) in (type(()), type([])):
+ | flags = word[1] | ||
+ | word = word[0] | ||
+ | word = u(re.sub(r'[-_ ]+', r'[-_ ]', u(word))) | ||
+ | word = u(r"(?<![\u00E8-\u00F8\xe8-\xf8\w])(?<!'')(?<!" + r'"' + ")(?:\\b|(?<=[ \\[\\]\\(\\):;.,\"'*\\xab\\xbb])|^)" + word + r"(?:\b(?![\u00E8-\u00F8\xe8-\xf8\w])(?!''|" + r'"' + ")|(?=[ \\[\\]\(\\):;.,\"'*\\xab\\xbb])|$)") | ||
+ | if flags is None: | ||
+ | return word | ||
+ | return (word, flags) | ||
def wordFilter(correct, *badwords, **kwargs): | def wordFilter(correct, *badwords, **kwargs): | ||
correct = u(correct) | correct = u(correct) | ||
Line 354: | Line 817: | ||
badwords2 = [] | badwords2 = [] | ||
for i in badwords: | for i in badwords: | ||
− | badwords2.append(u(i)) | + | if type(i) in (type(()), type([])): |
+ | badwords2.extend(map(u, i)) | ||
+ | else: | ||
+ | badwords2.append(u(i)) | ||
if not len(badwords2): | if not len(badwords2): | ||
badwords2.append(correct) | badwords2.append(correct) | ||
for w in badwords2: | for w in badwords2: | ||
− rs[wordRegex(w)] = correct
+ if 'keepcapitalization' in kwargs and kwargs['keepcapitalization']:
− return regexes(rs)
+ if type(w) not in (type(()), type([])):
+ | w = (w, 0) | ||
+ | else: | ||
+ | w = (w[0], 0) | ||
+ | rs[wordRegex(w, **kwargs)] = correct | ||
+ | rs[wordRegex((w[0][0].swapcase() + w[0][1:], w[1]), **kwargs)] = correct[0].swapcase() + correct[1:] | ||
+ | else: | ||
+ | rs[wordRegex(w, **kwargs)] = correct | ||
+ | return setFilterName(regexes(rs), u'WordFilter(' + u'/'.join(badwords2) + u' \u2192 ' + correct + u')') | ||
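# Illustrative usage of wordFilter (not part of the original source): map
# misspellings onto the correct form, respecting word boundaries.
def _exampleWordFilter():
    f = wordFilter(u'Engineer', u'engeneer', u'enginer')   # hypothetical terms
    return f(u'the engeneer built a dispenser')   # -> u'the Engineer built a dispenser'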
def enforceCapitalization(*words, **kwargs): | def enforceCapitalization(*words, **kwargs): | ||
for w in words: | for w in words: | ||
− addSafeFilter(wordFilter(u(w), **kwargs)
+ addSafeFilter(setFilterName(wordFilter(u(w)), u'EnforceCapitalization(' + u(w) + u')'), **kwargs)
pageFilters = [] | pageFilters = [] | ||
+ | pageWhitelist = [] | ||
categoryFilters = [] | categoryFilters = [] | ||
def pageFilter(page): | def pageFilter(page): | ||
− | global pageFilters | + | global pageFilters, pageWhitelist |
if type(page) in (type(()), type([])): | if type(page) in (type(()), type([])): | ||
pages = [] | pages = [] | ||
Line 377: | Line 851: | ||
page = page.title | page = page.title | ||
page = u(page) | page = u(page) | ||
+ | if page in pageWhitelist: | ||
+ | return True | ||
for f in pageFilters: | for f in pageFilters: | ||
if f.search(page): | if f.search(page): | ||
Line 391: | Line 867: | ||
global pageFilters | global pageFilters | ||
for f in filters: | for f in filters: | ||
− | pageFilters.append(compileRegex | + | pageFilters.append(compileRegex(f)) |
def addBlacklistPage(*pages): | def addBlacklistPage(*pages): | ||
for p in pages: | for p in pages: | ||
addPageFilter(re.escape(u(p))) | addPageFilter(re.escape(u(p))) | ||
+ | def addWhitelistPage(*pages): | ||
+ | global pageWhitelist | ||
+ | for p in pages: | ||
+ | if type(p) in (type([]), type(())): | ||
+ | addWhitelistPage(*p) | ||
+ | elif u(p) not in pageWhitelist: | ||
+ | pageWhitelist.append(u(p)) | ||
def addBlacklistCategory(*categories): | def addBlacklistCategory(*categories): | ||
global categoryFilters | global categoryFilters | ||
Line 409: | Line 892: | ||
addBlacklistPage(l) | addBlacklistPage(l) | ||
− filters = []
− def
+ filters = {
+ 'regular': [],
+ 'safe': [],
+ 'link': [],
+ 'template': [],
+ 'file': []
+ }
+ | def addFilterType(filterType, *fs, **kwargs): | ||
global filters | global filters | ||
for f in fs: | for f in fs: | ||
− filters.append(f)
+ f = (f, kwargs)
− def
+ if f not in filters[filterType]:
− global
+ filters[filterType].append(f)
+ def delFilterType(filterType, *fs, **kwargs):
global filters
for f in fs:
+ f = (f, kwargs)
− addLinkFilter(*fs):
+ if f in filters[filterType]:
+ filters[filterType].remove(f)
− for f in
+ def addFilter(*fs, **kwargs):
+ addFilterType('regular', *fs, **kwargs)
− def fixContent(content, article=None):
+ def delFilter(*fs, **kwargs):
− global filters
+ delFilterType('regular', *fs, **kwargs)
+ | def addSafeFilter(*fs, **kwargs): | ||
+ | addFilterType('safe', *fs, **kwargs) | ||
+ | def delSafeFilter(*fs, **kwargs): | ||
+ | delFilterType('safe', *fs, **kwargs) | ||
+ | def addLinkFilter(*fs, **kwargs): | ||
+ | addFilterType('link', *fs, **kwargs) | ||
+ | def delLinkFilter(*fs, **kwargs): | ||
+ | delFilterType('link', *fs, **kwargs) | ||
+ | def addTemplateFilter(*fs, **kwargs): | ||
+ | addFilterType('template', *fs, **kwargs) | ||
+ | def delTemplateFilter(*fs, **kwargs): | ||
+ | delFilterType('template', *fs, **kwargs) | ||
+ | def addFileFilter(*fs, **kwargs): | ||
+ | addFilterType('file', *fs, **kwargs) | ||
+ | def delFileFilter(*fs, **kwargs): | ||
+ | delFilterType('file', *fs, **kwargs) | ||
+ | def filterRepr(filters): | ||
+ | s = [] | ||
+ | reprRegex = compileRegex(r'^<function (\S+)') | ||
+ | for f in filters: | ||
+ | if type(f) in (type([]), type(())) and not len(f[1]): | ||
+ | f = f[0] # Omit parameters if there are none | ||
+ | try: | ||
+ | name = f.filterName | ||
+ | s.append(name) | ||
+ | except: | ||
+ | res = reprRegex.search(u(f)) | ||
+ | if res: | ||
+ | filterR = u(res.group(1)) | ||
+ | if filterR not in s: | ||
+ | s.append(filterR) | ||
+ | elif u(f) not in s: | ||
+ | s.append(u(f)) | ||
+ | if not len(s): | ||
+ | return u'Built-in filters' # Link simplification, template formatting, etc | ||
+ | return u', '.join(s) | ||
+ | def fixContent(content, article=None, returnActive=False, **kwargs): | ||
+ | global filters | ||
content = u(content) | content = u(content) | ||
oldcontent = u'' | oldcontent = u'' | ||
loopTimes = 0 | loopTimes = 0 | ||
+ | redirect = False | ||
+ | activeFilters = [] | ||
+ | if len(content) > 9: | ||
+ | redirect = content[:9] == u'#REDIRECT' | ||
+ | if article is not None: | ||
+ | article = page(article) | ||
while not loopTimes or content != oldcontent: | while not loopTimes or content != oldcontent: | ||
loopTimes += 1 | loopTimes += 1 | ||
if loopTimes > 2: | if loopTimes > 2: | ||
print 'Pass', loopTimes, 'on', article | print 'Pass', loopTimes, 'on', article | ||
− if loopTimes >=
+ if loopTimes >= config['pagePasses']:
− print 'Warning: More than
+ print 'Warning: More than', config['pagePasses'], 'fix passes on article', u(article.title)
break | break | ||
oldcontent = content | oldcontent = content | ||
# Apply unsafe filters | # Apply unsafe filters | ||
− content = sFilter(filters, content, article=article)
+ content, activeF = sFilter(filters['regular'], content, returnActive=True, article=article, redirect=redirect)
+ activeFilters.extend(activeF)
# Apply safe filters
+ content, safelist = safeContent(content)
+ content, templatelist = templateExtract(content)
− linklist =
+ content, linklist = linkExtract(content)
− content =
+ content, activeF = sFilter(filters['safe'], content, returnActive=True, article=article, redirect=redirect)
+ activeFilters.extend(activeF)
− if
+ extraLinks = setFilterName(curry(linkTextFilter, filters['safe']), u'(Content filters applied to links)')
+ addLinkFilter(extraLinks)
+ if not redirect:
+ linklist, activeF = linkFilter(filters['link'], linklist, returnActive=True, article=article, redirect=redirect)
+ | activeFilters.extend(activeF) | ||
+ | content = linkRestore(content, linklist) | ||
+ | templatelist, activeF = templateFilter(filters['template'], templatelist, returnActive=True, article=article, redirect=redirect) | ||
+ | activeFilters.extend(activeF) | ||
+ | content = templateRestore(content, templatelist) | ||
+ | content = safeContentRestore(content, safelist) | ||
+ | delLinkFilter(extraLinks) | ||
+ | if article is not None and u(article.title)[:5] == 'File:': | ||
+ | # Apply file filters | ||
+ | content, activeF = sFilter(filters['file'], content, returnActive=True, article=article, redirect=redirect) | ||
+ | activeFilters.extend(activeF) | ||
+ | if returnActive: | ||
+ | return content, activeFilters | ||
return content | return content | ||
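# Illustrative usage of fixContent (not part of the original source): run the
# registered filters over a piece of wikitext without touching the wiki.
def _exampleFixContent():
    addSafeFilter(dumbReplace(u'colour', u'color'))   # hypothetical filter
    newText, active = fixContent(u'The colour of the map.', returnActive=True)
    return newText, filterRepr(active)   # newText -> e.g. u'The color of the map.'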
def fixPage(article, **kwargs): | def fixPage(article, **kwargs): | ||
Line 466: | Line 1,013: | ||
return | return | ||
originalContent = u(article.getWikiText()) | originalContent = u(article.getWikiText()) | ||
− content = fixContent(originalContent, article=article)
+ content, activeFilters = fixContent(originalContent, returnActive=True, article=article)
if content != originalContent:
print article, 'needs to be updated.'
− summary = u'
+ summary = u'Auto: ' + filterRepr(activeFilters)
if 'reason' in kwargs:
summary += u' (' + u(kwargs['reason']) + u')'
− editPage(article, content, summary=summary)
+ if 'fake' in kwargs:
+ print '-------- New content is: --------'
+ print content
+ print '---------------------------------'
+ else:
+ editPage(article, content, summary=summary)
return True | return True | ||
print article, 'is up-to-date.' | print article, 'is up-to-date.' | ||
Line 478: | Line 1,030: | ||
def patrol(change): | def patrol(change): | ||
global config | global config | ||
− if int(change['rcid']) <= config['runtime']['rcid'] or not pageFilter(change['title']) or
+ secondsElapsed = (datetime.datetime.utcnow() - datetime.datetime.fromtimestamp(time.mktime(time.strptime(change['timestamp'], r'%Y-%m-%dT%H:%M:%SZ'))))
+ totalTime = secondsElapsed.seconds + secondsElapsed.days * 86400
+ if int(change['rcid']) <= config['runtime']['rcid'] or not pageFilter(change['title']) or totalTime <= config['freshnessThreshold']:
print 'Skipping', change['rcid'], change['title'] | print 'Skipping', change['rcid'], change['title'] | ||
if int(change['rcid']) > config['runtime']['rcid']: | if int(change['rcid']) > config['runtime']['rcid']: | ||
Line 486: | Line 1,040: | ||
config['runtime']['rcid'] = int(change['rcid']) | config['runtime']['rcid'] = int(change['rcid']) | ||
result = fixPage(change['title'], reason=u'Review RC#' + u(change['rcid'])) | result = fixPage(change['title'], reason=u'Review RC#' + u(change['rcid'])) | ||
− | |||
updateRCID() | updateRCID() | ||
def loadPage(p): | def loadPage(p): | ||
Line 494: | Line 1,047: | ||
except: | except: | ||
error('Couldn\'t grab page', p) | error('Couldn\'t grab page', p) | ||
− coderegex = compileRegex(r'^(?:
+ coderegex = compileRegex(r'^(?: [^\r\n]*(?:[\r\n]+|$))+', re.MULTILINE)
− trimcode = compileRegex(r'^
+ trimcode = compileRegex(r'^ |</?nowiki>', re.MULTILINE)
for m in coderegex.finditer(code): | for m in coderegex.finditer(code): | ||
try: | try: | ||
Line 512: | Line 1,065: | ||
except: | except: | ||
error('Error while trying to grab recent changes.') | error('Error while trying to grab recent changes.') | ||
+ | uniquePages = [] | ||
+ | uniqueChanges = {} | ||
for change in recentChanges: | for change in recentChanges: | ||
+ | if change['title'] not in uniquePages: | ||
+ | uniquePages.append(change['title']) | ||
+ | if change['title'] not in uniqueChanges or uniqueChanges[change['title']]['rcid'] < change['rcid']: | ||
+ | uniqueChanges[change['title']] = change | ||
+ | # Move page to end of queue | ||
+ | uniquePages.remove(change['title']) | ||
+ | uniquePages.append(change['title']) | ||
+ | for title in uniquePages: | ||
+ | change = uniqueChanges[title] | ||
try: | try: | ||
patrol(change) | patrol(change) | ||
Line 527: | Line 1,091: | ||
if l[:l.find(u':')].lower() == 'category': | if l[:l.find(u':')].lower() == 'category': | ||
subpages = wikitools.category.Category(wiki(), l[l.find(u':')+1:]).getAllMembers(titleonly=True) | subpages = wikitools.category.Category(wiki(), l[l.find(u':')+1:]).getAllMembers(titleonly=True) | ||
− | + | for s in subpages: | |
− | + | if s not in links: | |
− | + | links.append(s) | |
− | + | newLink, links = parsePageRequest(s, links=links) | |
− | + | content.append(newLink) | |
− | |||
if len(content): | if len(content): | ||
selfContent += u'\r\n' + u'\r\n'.join(content) | selfContent += u'\r\n' + u'\r\n'.join(content) | ||
Line 583: | Line 1,146: | ||
else: | else: | ||
editPage(requestPage, requests, summary=u'Finished all requests. Processed: [[:' + u']], [[:'.join(tofix) + u']]') | editPage(requestPage, requests, summary=u'Finished all requests. Processed: [[:' + u']], [[:'.join(tofix) + u']]') | ||
+ | def parseLocaleFile(content, language='english', languages={}): | ||
+ | content = u(content) | ||
+ | language = u(language) | ||
+ | if content.find('Tokens') != -1: | ||
+ | content = content[content.find('Tokens')+6:] | ||
+ | regexSplit = compileRegex('\n(?=\s*")', re.IGNORECASE | re.MULTILINE) | ||
+ | content = regexSplit.split(content) | ||
+ | regexLang = compileRegex(r'^"\[([-\w]+)\]([^"\s]+)"\s+"([^"]*)"', re.IGNORECASE | re.MULTILINE) | ||
+ | regexNoLang = compileRegex(r'^"([^[][^"\s]+)"\s+"([^"]*)"', re.IGNORECASE | re.MULTILINE) | ||
+ | for l in content: | ||
+ | l = u(l.strip()) | ||
+ | curlang = None | ||
+ | key, value = None, None | ||
+ | langRes = regexLang.search(l) | ||
+ | if langRes: | ||
+ | curlang = u(langRes.group(1)) | ||
+ | key, value = langRes.group(2), langRes.group(3) | ||
+ | else: | ||
+ | langRes = regexNoLang.search(l) | ||
+ | if langRes: | ||
+ | curlang = language | ||
+ | key, value = langRes.group(1), langRes.group(2) | ||
+ | if curlang is not None: | ||
+ | if u(key) not in languages: | ||
+ | languages[u(key)] = {} | ||
+ | languages[u(key)][curlang] = u(value) | ||
+ | else: | ||
+ | pass | ||
+ | return languages | ||
+ | def languagesFilter(languages, commonto=None, prefix=None, suffix=None, exceptions=[]): | ||
+ | filtered = {} | ||
+ | for k in languages: | ||
+ | if k in exceptions: | ||
+ | continue | ||
+ | if commonto is not None: | ||
+ | doit = True | ||
+ | for i in commonto: | ||
+ | if i not in languages[k]: | ||
+ | doit = False | ||
+ | break | ||
+ | if not doit: | ||
+ | continue | ||
+ | if prefix is not None: | ||
+ | doit = False | ||
+ | for i in prefix: | ||
+ | if k.lower()[:len(i)] == i.lower(): | ||
+ | doit = True | ||
+ | break | ||
+ | if not doit: | ||
+ | continue | ||
+ | if suffix is not None: | ||
+ | doit = False | ||
+ | for i in suffix: | ||
+ | if k.lower()[-len(i):] == i.lower(): | ||
+ | doit = True | ||
+ | break | ||
+ | if not doit: | ||
+ | continue | ||
+ | filtered[u(k)] = languages[k] | ||
+ | return filtered | ||
+ | def readLocaleFile(f): | ||
+ | return u(f.decode('utf16')) | ||
+ | def associateLocaleWordFilters(languages, fromLang, toLang, targetPageLang=None): | ||
+ | for a in languages: | ||
+ | f = wordFilter(languages[a][toLang], languages[a][fromLang]) | ||
+ | if targetPageLang is None: | ||
+ | addSafeFilter(f) | ||
+ | else: | ||
+ | addSafeFilter(f, language=targetPageLang) | ||
+ | def getRandBits(): | ||
+ | return random.getrandbits(128) | ||
+ | def getFileHash(filename): | ||
+ | h = hashlib.md5() | ||
+ | f = open(filename, 'rb') | ||
+ | for i in f.readlines(): | ||
+ | h.update(i) | ||
+ | f.close() | ||
+ | return u(h.hexdigest()) | ||
+ | def deleteFile(*fs): | ||
+ | for f in fs: | ||
+ | try: | ||
+ | os.remove(f) | ||
+ | except: | ||
+ | pass | ||
+ | def programExists(programName): | ||
+ | try: | ||
+ | result = subprocess.call(['which', programName]) | ||
+ | return result == 0 | ||
+ | except: | ||
+ | return False | ||
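# Illustrative usage of programExists (not part of the original source): check
# for an external tool before shelling out to it (relies on the Unix `which`).
def _exampleProgramCheck():
    return programExists('convert')   # True only if e.g. ImageMagick is on PATH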
def run(): | def run(): | ||
global config | global config | ||
print 'Bot started.' | print 'Bot started.' | ||
loadPage(config['pages']['filters']) | loadPage(config['pages']['filters']) | ||
− fixPage(
+ for p in sys.argv[1:]:
+ print 'Forced update to', p, '...'
+ fixPage(p)
loadBlacklist() | loadBlacklist() | ||
patrolChanges() | patrolChanges() | ||
Line 595: | Line 1,249: | ||
doPageRequests(force=False) | doPageRequests(force=False) | ||
updateEditCount() | updateEditCount() | ||
+ | runScheduledTasks() | ||
+ | try: | ||
+ | import rcNotify | ||
+ | rcNotify.main(once=True) | ||
+ | except: | ||
+ | pass | ||
try: | try: | ||
subprocess.Popen(['killall', 'cpulimit']).communicate() | subprocess.Popen(['killall', 'cpulimit']).communicate() | ||
Line 600: | Line 1,260: | ||
pass | pass | ||
print 'All done.' | print 'All done.' | ||
− run()</pre>
+ if __name__ == '__main__':
+ run()</nowiki></pre>
Latest revision as of 22:08, 23 March 2012
#!/usr/bin/python -OO # -*- coding: utf-8 -*- # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. import sys import os import time, datetime import re import hashlib import urllib2 import tempfile import traceback import threading import random import subprocess import cStringIO as StringIO import shutil import wikitools import wikiUpload import feedparser try: import steam except: steam = None from wikiUpload import wikiUploader from botConfig import config if steam is not None and 'steamAPI' in config: steam.set_api_key(config['steamAPI']) config['runtime'] = { 'rcid': -1, 'onlinercid': -1, 'wiki': None, 'edits': 0, 'regexes': {}, 'pages': {}, 'uploader': wikiUploader(config['username'], config['password'], config['api']) } def u(s): if type(s) is type(u''): return s if type(s) is type(''): try: return unicode(s) except: try: return unicode(s.decode('utf8')) except: try: return unicode(s.decode('windows-1252')) except: return unicode(s, errors='ignore') try: return unicode(s) except: try: return u(str(s)) except: return s class curry: def __init__(self, func, *args, **kwargs): self.func = func self.pending = args[:] self.kwargs = kwargs def __str__(self): if u(self.func) == u(regSub): return 'Regex' + u(self.pending) return u'<Curry of ' + u(self.func) + u'; args = ' + u(self.pending) + u'; kwargs = ' + u(self.kwargs) + u'>' def __repr__(self): return self.__str__() def __call__(self, *args, **kwargs): if kwargs and self.kwargs: kw = self.kwargs.copy() kw.update(kwargs) else: kw = kwargs or self.kwargs return self.func(*(self.pending + args), **kw) class BatchScheduler: def __init__(self, concurrency=16): self.concurrency = 16 self.tasks = [] def schedule(self, target, *args, **kwargs): self.tasks.append((target, args, kwargs)) def execute(self): while len(self.tasks): pool = [] numThreads = min(self.concurrency, len(self.tasks)) for task in range(numThreads): task = self.tasks[task] t = threading.Thread(target=task[0], args=task[1], kwargs=task[2]) t.start() pool.append(t) for t in pool: t.join() self.tasks = self.tasks[numThreads:] def getTempFilename(extension=None): global config if extension is None: f = tempfile.mkstemp(prefix=config['tempPrefix']) else: f = tempfile.mkstemp(suffix=u'.' + u(extension), prefix=config['tempPrefix']) os.close(f[0]) # Damn you Python I just want a filename return u(f[1]) def wiki(): global config if config['runtime']['wiki'] is None: config['runtime']['wiki'] = wikitools.wiki.Wiki(config['api']) print 'Logging in as', config['username'], '...' config['runtime']['wiki'].login(config['username'], config['password']) try: config['runtime']['onlinercid'] = int(u(wikitools.page.Page(wiki(), config['pages']['rcid']).getWikiText()).strip()) config['runtime']['rcid'] = config['runtime']['onlinercid'] except: error('Couldn\'t read RCID.') print 'Logged in.' 
return config['runtime']['wiki'] def page(p): global config if type(p) in (type(''), type(u'')): p = u(p) if p not in config['runtime']['pages']: config['runtime']['pages'][p] = wikitools.page.Page(wiki(), p, followRedir=False) return config['runtime']['pages'][p] # Else, it is a page object title = u(p.title) if title not in config['runtime']['pages']: config['runtime']['pages'][title] = p return config['runtime']['pages'][title] def getSummary(summary): summary = u(summary) while len(summary) > 250: if summary.find(u' ') == -1: summary = summary[:summary.rfind(u' ')] + u'...' else: summary = summary[:247] + u'...' return summary def editPage(p, content, summary=u'', minor=True, bot=True, nocreate=True): global config summary = getSummary(summary) p = page(p) try: print 'Editing', p.title, 'with summary', summary except: pass try: if nocreate: if minor: result = p.edit(u(content), summary=summary, minor=True, bot=bot, nocreate=nocreate) else: result = p.edit(u(content), summary=summary, notminor=True, bot=bot, nocreate=nocreate) else: if minor: result = p.edit(u(content), summary=summary, minor=True, bot=bot) else: result = p.edit(u(content), summary=summary, notminor=True, bot=bot) except: warning('Couldn\'t edit', p.title) return None try: if result['edit']['result']: config['runtime']['edits'] += 1 except: warning('Couldn\'t edit', p.title) return result def deletePage(p, summary=False): if summary: summary = getSummary(summary) return page(p).delete(summary) def uploadFile(filename, destfile, pagecontent='', license='', overwrite=False, reupload=False): global config return config['runtime']['uploader'].upload(filename, destfile, pagecontent, license, overwrite=overwrite, reupload=reupload) def updateRCID(): if abs(config['runtime']['rcid'] - config['runtime']['onlinercid']) >= config['rcidrate']: print 'Updating last RCID...' try: editPage(config['pages']['rcid'], config['runtime']['rcid'], summary=u'Updated Recent Changes log position to ' + u(config['runtime']['rcid'])) config['runtime']['onlinercid'] = config['runtime']['rcid'] except: warning('Couldn\'t update RCID.') def updateEditCount(force=False): global config if not config['runtime']['edits']: return if not force and random.randint(0, 40) != 7: return try: editPage(config['pages']['editcount'], int(wikitools.api.APIRequest(wiki(), { 'action': 'query', 'list': 'users', 'usprop': 'editcount', 'ususers': config['username'] }).query(querycontinue=False)['query']['users'][0]['editcount']) + 1, summary=u'Updated edit count.') config['runtime']['edits'] = 0 except: warning('Couldn\'t update edit count.') # Because SOME LIBRARY will not use singletons, this has to be done at the bot level # rather than the individual filter level to avoid loading the damn thing twice. 
steamGameSchemas = {} def steamGetGameSchema(game): global steamGameSchemas if steam is None: return None if game not in steamGameSchemas: steamGameSchemas[game] = game.item_schema() return steamGameSchemas[game] steamGameAssets = {} def steamGetGameAssets(game): global steamGameAssets if steam is None: return None if game not in steamGameAssets: steamGameAssets[game] = game.assets() return steamGameAssets[game] def compileRegex(regex, flags=re.IGNORECASE): global config regex = u(regex) if regex in config['runtime']['regexes']: return config['runtime']['regexes'][regex] config['runtime']['regexes'][regex] = re.compile(regex, flags) return config['runtime']['regexes'][regex] def warning(*info): s = [] print info import traceback traceback.print_exc() def error(*info): warning(*info) sys.exit(1) def setFilterName(f, name): name = u(name) f.__unicode__ = lambda: name f.__str__ = lambda: name.encode('utf8') f.filterName = name return f class link: def __init__(self, content): content = u(content) self.joined = False self.setBody(content) self.setType(u'unknown') self.setLabel(None) self.setLink(u'') self.anchor = None self.joined = False if len(content) > 2: if content[:2] == u'[[' and content[-2:] == u']]': split = content[2:-2].split(u'|') if len(split) in (1, 2): self.setType(u'internal') lnk = split[0] if lnk.find(u':') == -1: lnk = lnk.replace(u'_', u' ') anchor = None if lnk.find(u'#') != -1: lnk, anchor = lnk.split(u'#', 1) self.setAnchor(anchor) self.setLink(lnk) if len(split) == 2: self.setLabel(split[1]) else: self.setLabel(split[0]) self.joined = anchor is None elif content[0] == u'[' and content[-1] == u']': split = content[1:-1].split(u' ', 1) self.setType(u'external') self.setLink(split[0]) if len(split) == 2: self.setLabel(split[1]) else: self.setLabel(None) def getType(self): return u(self.kind) def getBody(self): return u(self.body) def getLink(self, withAnchor=False): if withAnchor and self.getAnchor() is not None: return u(self.link) + u'#' + self.getAnchor() return u(self.link) def getAnchor(self): return self.anchor def getLabel(self): if self.label is None: return None if self.joined: return self.getLink() return u(self.label) def setType(self, kind): self.kind = u(kind) def setBody(self, body): self.body = u(body) def setLink(self, link): link = u(link) if self.getType() == u'internal' and link.find(u'#') != -1: link, anchor = link.split(u'#', 1) self.setAnchor(anchor) self.link = link if self.joined: self.label = u(link) replaceDots = compileRegex(r'(?:\.[a-f\d][a-f\d])+') def _replaceDots(self, g): s = '' g = g.group(0) for i in xrange(0, len(g), 3): s += chr(int(g[i + 1:i + 3], 16)) return s.decode('utf8') def setAnchor(self, anchor): if self.getType() == u'internal': u(anchor).replace(u'_', u' ') try: anchor = link.replaceDots.sub(self._replaceDots, anchor) except: pass self.anchor = anchor def setLabel(self, label): if label is None: self.label = None else: self.label = u(label) if self.joined: self.link = u(label) def __str__(self): return self.__unicode__() def __repr__(self): return u'<Link-' + self.getType() + u': ' + self.__unicode__() + u'>' def __unicode__(self): label = self.getLabel() tmpLink = self.getLink(withAnchor=True) if self.getType() == u'internal': tmpLink2 = tmpLink.replace(u'_', u' ') if label in (tmpLink2, tmpLink) or (label and tmpLink and (label[0].lower() == tmpLink[0].lower() and tmpLink[1:] == label[1:]) or (label[0].lower() == tmpLink2[0].lower() and tmpLink2[1:] == label[1:])): return u'[[' + label + u']]' elif tmpLink and label and 
len(label) > len(tmpLink) and (label.lower().find(tmpLink2.lower()) == 0 or label.lower().find(tmpLink.lower()) == 0): index = max(label.lower().find(tmpLink2.lower()), label.lower().find(tmpLink.lower())) badchars = (u' ', u'_') nobadchars = True for c in badchars: if label[:index].find(c) != -1 or label[index+len(tmpLink):].find(c) != -1: nobadchars = False if nobadchars: return label[:index] + u(link(u'[[' + tmpLink + u'|' + label[index:index+len(tmpLink)] + u']]')) + label[index+len(tmpLink):] return u'[[' + tmpLink + u'|' + label + u']]' if self.getType() == u'external': if label is None: return u'[' + tmpLink + u']' return u'[' + tmpLink + u' ' + label + u']' return self.getBody() class template: maxInlineParams = 1 def __init__(self, content): content = u(content) self.changed = False self.content = content self.name = None self.order = None self.links = [] self.params = [] self.paramNum = 0 self.indentation = {} self.defaultIndent = 0 if len(content) > 4 and content[:2] == '{{' and content[-2:] == '}}': innerRegex = compileRegex(r'\s*\|\s*') itemRegex = compileRegex(r'^(\S[^=]*?)\s*=\s*([\s\S]*?)$') content = content[2:-2] content, self.links = linkExtract(content) innerStuff = innerRegex.split(content) if innerStuff[0][:9].lower() == 'template:': innerStuff[0] = innerStuff[0][9:] self.name = u(innerStuff[0][0].upper() + innerStuff[0][1:]).replace(u'_', u' ').strip() innerStuff = innerStuff[1:] for i in innerStuff: i = linkRestore(i.strip(), self.links, restore=True) itemRes = itemRegex.search(i) if itemRes: self.params.append((u(itemRes.group(1)).lower(), u(itemRes.group(2)))) else: self.paramNum += 1 self.params.append((u(self.paramNum), i)) self.originalContent = self.content self.originalParams = self.params[:] self.originalName = self.name self.forceindent = False def indentationMatters(self, doesitmatter=True): self.forceindent = doesitmatter def getName(self): return self.name def setName(self, name): self.name = u(name).replace(u'_', u' ') self.changed = self.changed or self.name != self.originalName def getParam(self, key): key = u(key).lower() for k, v in self.params: if k == key: return v return None def delParam(self, *indexes): for index in indexes: index = u(index) isNumber = self.isInt(index) for p in range(len(self.params)): k, v = self.params[p] if k == index: self.changed = True self.params = self.params[:p] + self.params[p+1:] if isNumber: if int(index) == self.paramNum: self.paramNum -= 1 break def setParam(self, index=None, value=u''): if value is None: return self.delParam(index) if index is None: index = u(self.paramNum) else: index = u(index).lower() isNumber = self.isInt(index) value = u(value) hasChanged = False for p in range(len(self.params)): k, v = self.params[p] if k == index: self.changed = self.changed or v != value self.params[p] = (k, value) hasChanged = True break if not hasChanged: if isNumber: while self.numParam < int(index) - 1: self.appendParam(u'') self.params.append((index, value)) self.changed = True def appendParam(self, value=u''): self.paramNum += 1 self.params.append((u(self.paramNum), value)) def setPreferedIndentation(self, index, indent=0): self.indentation[u(index)] = indent self.changed = self.changed or self.forceindent def setDefaultIndentation(self, indent=0): self.defaultIndent = indent self.changed = self.changed or self.forceindent def setPreferedOrder(self, order=None): order2 = [] for o in order: order2.append(u(o)) self.order = order2 oldParams = self.params[:] self.changed = self.changed or self.fixOrder() == oldParams 
    def renameParam(self, oldkey, newkey):
        oldkey, newkey = u(oldkey).lower(), u(newkey).lower()
        if oldkey == newkey:
            return
        for p in range(len(self.params)):
            k, v = self.params[p]
            if k == oldkey:
                self.params[p] = (newkey, v)
                self.changed = True
                break
    def fixOrder(self):
        if self.order is None:
            return self.params
        newParams = []
        doneParams = []
        for k in self.order:
            k = u(k)
            if self.getParam(k) is not None:
                newParams.append((k, self.getParam(k)))
                doneParams.append(k)
        for k, v in self.params:
            if k not in doneParams:
                doneParams.append(k)
                newParams.append((k, v))
        self.params = newParams
        return self.params
    def defined(self):
        return self.name is not None
    def isInt(self, i):
        try:
            return u(int(i)) == u(i) and int(i) > 0
        except:
            return False
    def __str__(self):
        return self.__unicode__()
    def __repr__(self):
        return u'<Template-' + self.getName() + u': ' + self.__unicode__() + u'>'
    def __unicode__(self):
        if not self.defined():
            return u''
        if not self.changed:
            return self.originalContent
        self.fixOrder()
        params = [self.name]
        indentMode = len(self.params) > template.maxInlineParams or self.forceindent
        maxIndent = 0
        if indentMode:
            for k, v in self.params:
                l = len(k) + self.defaultIndent
                if k in self.indentation:
                    l = len(k) + self.indentation[k]
                if not self.isInt(k) and l > maxIndent:
                    maxIndent = l
        numParam = 1
        for k, v in self.params:
            indent = self.defaultIndent
            if k in self.indentation and indentMode:
                indent = self.indentation[k]
            try:
                isNumber = u(int(k)) == u(k) and int(k) > 0
            except:
                isNumber = False
            if indentMode:
                key = u' ' * indent + u'| '
                addKey = True
                if self.isInt(k):
                    if int(k) == numParam:
                        addKey = False
                        numParam += 1
                if addKey:
                    key += k + (u' ' * max(0, maxIndent - len(k) - indent)) + u' = '
            else:
                key = u''
                addKey = True
                if self.isInt(k):
                    if int(k) == numParam:
                        addKey = False
                        numParam += 1
                if addKey:
                    key += k + u' = '
            params.append(key + v)
        if indentMode:
            params = u'\n'.join(params) + u'\n'
        else:
            params = u' | '.join(params)
        return u'{{' + params + u'}}'

# Replace [[internal]] and [external] links with placeholder tokens so that
# content filters cannot mangle them; linkRestore puts them back afterwards.
def linkExtract(content):
    content = u(content)
    links1 = compileRegex(r'\[\[([^\[\]]+)\]\]')
    links2 = compileRegex(r'\[([^\[\]]+)\](?!\])')
    linkcount = 0
    linklist = []
    res = links1.search(content)
    while res:
        linklist.append(link(res.group()))
        content = content[:res.start()] + u'~!~!~!~OMGLINK-' + u(linkcount) + u'~!~!~!~' + content[res.end():]
        linkcount += 1
        res = links1.search(content)
    res = links2.search(content)
    while res:
        linklist.append(link(res.group()))
        content = content[:res.start()] + u'~!~!~!~OMGLINK-' + u(linkcount) + u'~!~!~!~' + content[res.end():]
        linkcount += 1
        res = links2.search(content)
    return content, linklist

def templateExtract(content):
    content = u(content)
    templatesR = compileRegex(r'\{\{([^\{\}]+)\}\}')
    templatecount = 0
    templatelist = []
    res = templatesR.search(content)
    while res:
        templatelist.append(template(res.group()))
        content = content[:res.start()] + u'~!~!~!~OMGTEMPLATE-' + u(templatecount) + u'~!~!~!~' + content[res.end():]
        templatecount += 1
        res = templatesR.search(content)
    return content, templatelist

# Replace a placeholder (plus the smaller run of surrounding whitespace) by repl.
def blankAround(content, search, repl=u''):
    content = u(content)
    search = u(search)
    repl = u(repl)
    blank = compileRegex(u'(\\s*)' + u(re.escape(search)) + u'(\\s*)')
    res = blank.search(content)
    if not res:
        return content.replace(search, repl)
    if u(res.group(0)) == content:
        return repl
    if len(res.group(1)) < len(res.group(2)):
        return content[:res.end(1)] + content[res.end(2):]
    else:
        return content[:res.start()] + content[res.start(2):]

def linkRestore(content, links=[], restore=False):
    linklist = links[:]
    linkcount = len(linklist)
    i = 0
    linklist.reverse()
    for l in linklist:
        i += 1
        if l is None:
            content = blankAround(content, u'~!~!~!~OMGLINK-' + u(linkcount - i) + u'~!~!~!~', u'')
        else:
            if restore:
                l = l.getBody()
            content = content.replace(u'~!~!~!~OMGLINK-' + u(linkcount - i) + u'~!~!~!~', u(l))
    return content

def templateRestore(content, templatelist=[]):
    templatecount = len(templatelist)
    i = 0
    templatelist.reverse()
    for t in templatelist:
        i += 1
        if t is None:
            content = blankAround(content, u'~!~!~!~OMGTEMPLATE-' + u(templatecount - i) + u'~!~!~!~', u'')
        else:
            content = content.replace(u'~!~!~!~OMGTEMPLATE-' + u(templatecount - i) + u'~!~!~!~', u(t))
    return content

# Shield <ref>/<gallery>/<pre>/<code> blocks and HTML comments behind placeholders
# so that no filter touches their contents; safeContentRestore undoes this.
def safeContent(content):
    safelist = []
    tags = compileRegex(r'<(?:ref|gallery|pre|code)[^<>]*>[\S\s]*?</(?:ref|gallery|pre|code)>', re.IGNORECASE | re.MULTILINE)
    comments = compileRegex(r'<!--[\S\s]*?-->')
    tagcount = 0
    res = tags.search(content)
    if not res:
        res = comments.search(content)
    while res:
        safelist.append(('~!~!~!~OMGTAG-' + u(tagcount) + u'~!~!~!~', u(res.group())))
        content = content[:res.start()] + u'~!~!~!~OMGTAG-' + u(tagcount) + u'~!~!~!~' + content[res.end():]
        tagcount += 1
        res = tags.search(content)
        if not res:
            res = comments.search(content)
    return content, safelist

def safeContentRestore(content, safelist=[]):
    safelist.reverse()
    for s in safelist:
        content = content.replace(s[0], s[1])
    return content

def regReplaceCallBack(sub, match):
    groupcount = 1
    for g in match.groups():
        if g is not None:
            sub = sub.replace(u'$' + u(groupcount), g)
        else:
            sub = sub.replace(u'$' + u(groupcount), u'')
        groupcount += 1
    return sub

# Apply every regex in the mapping repeatedly until the content stops changing.
def regSub(regexes, content, **kwargs):
    content = u(content)
    for regex in regexes.keys():
        if type(regex) in (type(()), type([])):
            compiled = compileRegex(u(regex[0]), regex[1])
        else:
            compiled = compileRegex(u(regex), re.IGNORECASE | re.DOTALL | re.MULTILINE)
        callback = curry(regReplaceCallBack, u(regexes[regex]))
        oldcontent = u''
        while content != oldcontent:
            oldcontent = content
            content = compiled.sub(callback, content)
    return u(content)

def dumbReplacement(strings, content, **kwargs):
    content = u(content)
    for s in strings.keys():
        content = content.replace(u(s), u(strings[s]))
    return content

def filterEnabled(f, **kwargs):
    if type(f) is not type(()):
        return True
    if len(f) < 2:
        return True
    if type(f[1]) is not type({}):
        return True
    article = None
    if 'article' in kwargs.keys():
        article = kwargs['article']
    if article is None:
        return True
    if type(article) not in (type(u''), type('')):
        article = article.title
    if article is None:
        return True
    if 'languageBlacklist' in f[1].keys():
        for i in f[1]['languageBlacklist']:
            if compileRegex(u'/' + u(i) + u'$').search(u(article)):
                return False
        return True
    if 'languageWhitelist' in f[1].keys():
        for i in f[1]['languageWhitelist']:
            if compileRegex(u'/' + u(i) + u'$').search(u(article)):
                return True
        return False
    if 'language' in f[1].keys():
        return compileRegex(u'/' + u(f[1]['language']) + u'$').search(u(article))
    return True

scheduledTasks = []
def scheduleTask(task, oneinevery):
    global scheduledTasks
    result = random.randint(0, oneinevery-1)
    print 'Task:', task, '; result:', result
    if not result:
        scheduledTasks.append(task)

def runScheduledTasks():
    global scheduledTasks
    if not len(scheduledTasks):
        print 'No tasks scheduled.'
        return
    print 'Running scheduled tasks...'
    for t in scheduledTasks:
        print 'Running task:', t
        try:
            t()
            print 'End of task:', t
        except:
            print 'Error while executing task:', t

def sFilter(filters, content, returnActive=False, **kwargs):
    content = u(content)
    lenfilters = len(filters)
    if not lenfilters:
        if returnActive:
            return content, []
        return content
    filtercount = 0
    activeFilters = []
    for f in filters:
        if not filterEnabled(f, **kwargs):
            continue
        if type(f) is type(()):
            f, params = f
        filtercount += 1
        #print 'Filter', f, '(', filtercount, '/', lenfilters, ')'
        loopTimes = 0
        beforeFilter = u''
        while not loopTimes or beforeFilter != content:
            loopTimes += 1
            if loopTimes >= config['filterPasses']:
                print 'Warning: More than', config['filterPasses'], 'loops with filter', u(f)
                break
            beforeFilter = content
            content = u(f(content, **kwargs))
            if content != beforeFilter and f not in activeFilters:
                activeFilters.append(f)
    if returnActive:
        return content, activeFilters
    return content

def linkFilter(filters, linklist, returnActive=False, **kwargs):
    activeFilters = []
    for f in filters:
        if not filterEnabled(f, **kwargs):
            continue
        if type(f) is type(()):
            f, params = f
        for i in range(len(linklist)):
            if linklist[i] is not None and isinstance(linklist[i], link):
                oldLink = u(linklist[i])
                linklist[i] = f(linklist[i], **kwargs)
                if oldLink != u(linklist[i]) and f not in activeFilters:
                    activeFilters.append(f)
    if returnActive:
        return linklist, activeFilters
    return linklist

def templateFilter(filters, templatelist, returnActive=False, **kwargs):
    activeFilters = []
    for f in filters:
        if not filterEnabled(f, **kwargs):
            continue
        if type(f) is type(()):
            f, params = f
        for i in range(len(templatelist)):
            if templatelist[i] is not None and isinstance(templatelist[i], template):
                oldTemplate = u(templatelist[i])
                templatelist[i] = f(templatelist[i], **kwargs)
                if oldTemplate != u(templatelist[i]) and f not in activeFilters:
                    activeFilters.append(f)
    if returnActive:
        return templatelist, activeFilters
    return templatelist

def linkTextFilter(subfilters, l, linksafe=False, **kwargs):
    if l.getType() == u'internal' and l.getLink().find(u':') == -1 and pageFilter(l.getLink()):
        if linksafe:
            l.setLink(sFilter(subfilters, l.getLink(), **kwargs))
        if l.getLabel().find(u':') == -1:
            l.setLabel(sFilter(subfilters, l.getLabel(), **kwargs))
    return l

def linkDomainSub(fromDomain, toDomain, link, **kwargs):
    domainR = compileRegex(r'^(https?://(?:[-\w]+\.)*)' + u(re.escape(fromDomain)) + r'(\S+)$')
    toDomain = u(toDomain)
    if link.getType() == 'external':
        linkInfo = domainR.search(link.getLink())
        if linkInfo:
            link.setLink(u(linkInfo.group(1)) + toDomain + u(linkInfo.group(2)))
    return link

def linkDomainFilter(fromDomain, toDomain):
    return curry(linkDomainSub, fromDomain, toDomain)

def regexes(rs):
    return curry(regSub, rs)

def regex(reg, replace):
    return regexes({reg: replace})

def dumbReplaces(rs):
    return setFilterName(curry(dumbReplacement, rs), u'DumbReplacements(' + u(rs) + u')')

def dumbReplace(subject, replacement):
    return setFilterName(dumbReplaces({subject: replacement}), u'DumbReplacement(' + u(subject) + u' \u2192 ' + u(replacement) + u')')

def wordRegex(word, **kwargs):
    flags = None
    if type(word) in (type(()), type([])):
        flags = word[1]
        word = word[0]
    word = u(re.sub(r'[-_ ]+', r'[-_ ]', u(word)))
    word = u(r"(?<![\u00E8-\u00F8\xe8-\xf8\w])(?<!'')(?<!" + r'"' + ")(?:\\b|(?<=[ \\[\\]\\(\\):;.,\"'*\\xab\\xbb])|^)" + word + r"(?:\b(?![\u00E8-\u00F8\xe8-\xf8\w])(?!''|" + r'"' + ")|(?=[ \\[\\]\(\\):;.,\"'*\\xab\\xbb])|$)")
    if flags is None:
        return word
    return (word, flags)

def wordFilter(correct, *badwords, **kwargs):
    correct = u(correct)
    rs = {}
    badwords2 = []
    for i in badwords:
        if type(i) in (type(()), type([])):
            badwords2.extend(map(u, i))
        else:
            badwords2.append(u(i))
    if not len(badwords2):
        badwords2.append(correct)
    for w in badwords2:
        if 'keepcapitalization' in kwargs and kwargs['keepcapitalization']:
            if type(w) not in (type(()), type([])):
                w = (w, 0)
            else:
                w = (w[0], 0)
            rs[wordRegex(w, **kwargs)] = correct
            rs[wordRegex((w[0][0].swapcase() + w[0][1:], w[1]), **kwargs)] = correct[0].swapcase() + correct[1:]
        else:
            rs[wordRegex(w, **kwargs)] = correct
    return setFilterName(regexes(rs), u'WordFilter(' + u'/'.join(badwords2) + u' \u2192 ' + correct + u')')

def enforceCapitalization(*words, **kwargs):
    for w in words:
        addSafeFilter(setFilterName(wordFilter(u(w)), u'EnforceCapitalization(' + u(w) + u')'), **kwargs)

pageFilters = []
pageWhitelist = []
categoryFilters = []
def pageFilter(page):
    global pageFilters, pageWhitelist
    if type(page) in (type(()), type([])):
        pages = []
        for p in page:
            if pageFilter(p):
                pages.append(p)
        return pages
    if type(page) not in (type(u''), type('')):
        page = page.title
    page = u(page)
    if page in pageWhitelist:
        return True
    for f in pageFilters:
        if f.search(page):
            return False
    return True

def categoryFilter(page):
    global categoryFilters
    pageCategories = page.getCategories()
    for c in pageCategories:
        if u(c).replace(u'_', ' ') in categoryFilters:
            return False
    return True

def addPageFilter(*filters):
    global pageFilters
    for f in filters:
        pageFilters.append(compileRegex(f))

def addBlacklistPage(*pages):
    for p in pages:
        addPageFilter(re.escape(u(p)))

def addWhitelistPage(*pages):
    global pageWhitelist
    for p in pages:
        if type(p) in (type([]), type(())):
            addWhitelistPage(*p)
        elif u(p) not in pageWhitelist:
            pageWhitelist.append(u(p))

def addBlacklistCategory(*categories):
    global categoryFilters
    for c in categories:
        categoryFilters.append(u(c).replace(u'_', ' '))

def loadBlacklist():
    global config
    for l in page(config['pages']['blacklist']).getLinks():
        l = u(l)
        if l.find(u':') != -1:
            if l[:l.find(u':')].lower() == 'category':
                addBlacklistCategory(l)
                continue
        addBlacklistPage(l)

filters = {
    'regular': [],
    'safe': [],
    'link': [],
    'template': [],
    'file': []
}
def addFilterType(filterType, *fs, **kwargs):
    global filters
    for f in fs:
        f = (f, kwargs)
        if f not in filters[filterType]:
            filters[filterType].append(f)

def delFilterType(filterType, *fs, **kwargs):
    global filters
    for f in fs:
        f = (f, kwargs)
        if f in filters[filterType]:
            filters[filterType].remove(f)

def addFilter(*fs, **kwargs):
    addFilterType('regular', *fs, **kwargs)

def delFilter(*fs, **kwargs):
    delFilterType('regular', *fs, **kwargs)

def addSafeFilter(*fs, **kwargs):
    addFilterType('safe', *fs, **kwargs)

def delSafeFilter(*fs, **kwargs):
    delFilterType('safe', *fs, **kwargs)

def addLinkFilter(*fs, **kwargs):
    addFilterType('link', *fs, **kwargs)

def delLinkFilter(*fs, **kwargs):
    delFilterType('link', *fs, **kwargs)

def addTemplateFilter(*fs, **kwargs):
    addFilterType('template', *fs, **kwargs)

def delTemplateFilter(*fs, **kwargs):
    delFilterType('template', *fs, **kwargs)

def addFileFilter(*fs, **kwargs):
    addFilterType('file', *fs, **kwargs)

def delFileFilter(*fs, **kwargs):
    delFilterType('file', *fs, **kwargs)

def filterRepr(filters):
    s = []
    reprRegex = compileRegex(r'^<function (\S+)')
    for f in filters:
        if type(f) in (type([]), type(())) and not len(f[1]):
            f = f[0] # Omit parameters if there are none
        try:
            name = f.filterName
            s.append(name)
        except:
            res = reprRegex.search(u(f))
            if res:
                filterR = u(res.group(1))
                if filterR not in s:
                    s.append(filterR)
            elif u(f) not in s:
                s.append(u(f))
    if not len(s):
        return u'Built-in filters' # Link simplification, template formatting, etc
    return u', '.join(s)

def fixContent(content, article=None, returnActive=False, **kwargs):
    global filters
    content = u(content)
    oldcontent = u''
    loopTimes = 0
    redirect = False
    activeFilters = []
    if len(content) > 9:
        redirect = content[:9] == u'#REDIRECT'
    if article is not None:
        article = page(article)
    while not loopTimes or content != oldcontent:
        loopTimes += 1
        if loopTimes > 2:
            print 'Pass', loopTimes, 'on', article
        if loopTimes >= config['pagePasses']:
            print 'Warning: More than', config['pagePasses'], 'fix passes on article', u(article.title)
            break
        oldcontent = content
        # Apply unsafe filters
        content, activeF = sFilter(filters['regular'], content, returnActive=True, article=article, redirect=redirect)
        activeFilters.extend(activeF)
        # Apply safe filters
        content, safelist = safeContent(content)
        content, templatelist = templateExtract(content)
        content, linklist = linkExtract(content)
        content, activeF = sFilter(filters['safe'], content, returnActive=True, article=article, redirect=redirect)
        activeFilters.extend(activeF)
        extraLinks = setFilterName(curry(linkTextFilter, filters['safe']), u'(Content filters applied to links)')
        addLinkFilter(extraLinks)
        if not redirect:
            linklist, activeF = linkFilter(filters['link'], linklist, returnActive=True, article=article, redirect=redirect)
            activeFilters.extend(activeF)
        content = linkRestore(content, linklist)
        templatelist, activeF = templateFilter(filters['template'], templatelist, returnActive=True, article=article, redirect=redirect)
        activeFilters.extend(activeF)
        content = templateRestore(content, templatelist)
        content = safeContentRestore(content, safelist)
        delLinkFilter(extraLinks)
        if article is not None and u(article.title)[:5] == 'File:':
            # Apply file filters
            content, activeF = sFilter(filters['file'], content, returnActive=True, article=article, redirect=redirect)
            activeFilters.extend(activeF)
    if returnActive:
        return content, activeFilters
    return content

def fixPage(article, **kwargs):
    article = page(article)
    force = False
    if 'force' in kwargs and kwargs['force']:
        force = True
    try:
        catFilter = categoryFilter(article)
    except wikitools.page.NoPage:
        print 'No such page:', article
        return False
    except:
        catFilter = True
    if not force and (not pageFilter(article) or not catFilter):
        print 'Skipping:', article
        return
    originalContent = u(article.getWikiText())
    content, activeFilters = fixContent(originalContent, returnActive=True, article=article)
    if content != originalContent:
        print article, 'needs to be updated.'
        summary = u'Auto: ' + filterRepr(activeFilters)
        if 'reason' in kwargs:
            summary += u' (' + u(kwargs['reason']) + u')'
        if 'fake' in kwargs:
            print '-------- New content is: --------'
            print content
            print '---------------------------------'
        else:
            editPage(article, content, summary=summary)
        return True
    print article, 'is up-to-date.'
    return False

def patrol(change):
    global config
    secondsElapsed = (datetime.datetime.utcnow() - datetime.datetime.fromtimestamp(time.mktime(time.strptime(change['timestamp'], r'%Y-%m-%dT%H:%M:%SZ'))))
    totalTime = secondsElapsed.seconds + secondsElapsed.days * 86400
    if int(change['rcid']) <= config['runtime']['rcid'] or not pageFilter(change['title']) or totalTime <= config['freshnessThreshold']:
        print 'Skipping', change['rcid'], change['title']
        if int(change['rcid']) > config['runtime']['rcid']:
            config['runtime']['rcid'] = int(change['rcid'])
        return
    print 'Patrolling', change['title']
    config['runtime']['rcid'] = int(change['rcid'])
    result = fixPage(change['title'], reason=u'Review RC#' + u(change['rcid']))
    updateRCID()

def loadPage(p):
    p = page(p)
    try:
        code = u(p.getWikiText())
    except:
        error('Couldn\'t grab page', p)
    coderegex = compileRegex(r'^(?: [^\r\n]*(?:[\r\n]+|$))+', re.MULTILINE)
    trimcode = compileRegex(r'^ |</?nowiki>', re.MULTILINE)
    for m in coderegex.finditer(code):
        try:
            exec(trimcode.sub(u'', u(m.group())))
        except:
            error('Error while parsing code: ', m.group())

def patrolChanges():
    try:
        recentChanges = wikitools.api.APIRequest(wiki(), {
            'action': 'query',
            'list': 'recentchanges',
            'rctoken': 'patrol',
            'rclimit': '500'
        }).query(querycontinue=False)[u'query'][u'recentchanges']
        recentChanges.reverse()
    except:
        error('Error while trying to grab recent changes.')
    uniquePages = []
    uniqueChanges = {}
    for change in recentChanges:
        if change['title'] not in uniquePages:
            uniquePages.append(change['title'])
        if change['title'] not in uniqueChanges or uniqueChanges[change['title']]['rcid'] < change['rcid']:
            uniqueChanges[change['title']] = change
            # Move page to end of queue
            uniquePages.remove(change['title'])
            uniquePages.append(change['title'])
    for title in uniquePages:
        change = uniqueChanges[title]
        try:
            patrol(change)
        except KeyboardInterrupt:
            error('Interrupted:', change)
        except:
            warning('Failed to patrol change:', change)
    print 'Done patrolling.'

def parsePageRequest(l, links=[]):
    l = u(l)
    content = []
    selfContent = u'* [[:' + l + u']]'
    if l.find(u':') != -1:
        if l[:l.find(u':')].lower() == 'category':
            subpages = wikitools.category.Category(wiki(), l[l.find(u':')+1:]).getAllMembers(titleonly=True)
            for s in subpages:
                if s not in links:
                    links.append(s)
                    newLink, links = parsePageRequest(s, links=links)
                    content.append(newLink)
    if len(content):
        selfContent += u'\r\n' + u'\r\n'.join(content)
    return selfContent, links

def doPageRequests(force=False):
    global config
    print 'Executing page requests. Force =', force
    if force:
        requestPageTitle = config['pages']['pagerequestsforce']
    else:
        requestPageTitle = config['pages']['pagerequests']
    requestPage = page(requestPageTitle)
    reqre = compileRegex(r'^\*[\t ]*\[\[:?([^][]+)\]\]', re.MULTILINE)
    originalRequests = u(requestPage.getWikiText())
    requests = originalRequests
    matches = []
    links = []
    for m in reqre.finditer(requests):
        matches.append(m)
        l = u(m.group(1))
        if l not in links:
            links.append(l)
    matches.reverse()
    for m in matches:
        pagelink, links = parsePageRequest(u(m.group(1)), links=links)
        requests = requests[:m.start()] + pagelink + requests[m.end():]
    requests = regSub({r'^[ \t]*(\*[^\r\n]+)[\r\n]+(?=^[ \t]*\*)':'$1\r\n'}, requests)
    reqre2 = compileRegex(r'^\*[\t ]*\[\[:?([^][]+)\]\]\s*', re.MULTILINE)
    matches2 = []
    requestsDone = 0
    tooMany = False
    for m in reqre2.finditer(requests):
        requestsDone += 1
        if requestsDone > config['maxrequests']:
            tooMany = True
            break
        matches2.append(m)
    matches2.reverse()
    tofix = []
    for m in matches2:
        tofix.append(u(m.group(1)))
        requests = requests[:m.start()] + requests[m.end():]
    tofix.reverse()
    for p in tofix:
        fixPage(p, reason=u'Requested on [[:' + u(requestPageTitle) + u']]', force=force)
    requests = regSub({r'^[ \t]*(\*[^\r\n]+)[\r\n]+(?=^[ \t]*\*)':'$1\r\n'}, requests)
    if len(tofix) and originalRequests != requests:
        if tooMany:
            editPage(requestPage, requests, summary=u'Processed: [[:' + u']], [[:'.join(tofix) + u']]')
        else:
            editPage(requestPage, requests, summary=u'Finished all requests. Processed: [[:' + u']], [[:'.join(tofix) + u']]')

def parseLocaleFile(content, language='english', languages={}):
    content = u(content)
    language = u(language)
    if content.find('Tokens') != -1:
        content = content[content.find('Tokens')+6:]
    regexSplit = compileRegex('\n(?=\s*")', re.IGNORECASE | re.MULTILINE)
    content = regexSplit.split(content)
    regexLang = compileRegex(r'^"\[([-\w]+)\]([^"\s]+)"\s+"([^"]*)"', re.IGNORECASE | re.MULTILINE)
    regexNoLang = compileRegex(r'^"([^[][^"\s]+)"\s+"([^"]*)"', re.IGNORECASE | re.MULTILINE)
    for l in content:
        l = u(l.strip())
        curlang = None
        key, value = None, None
        langRes = regexLang.search(l)
        if langRes:
            curlang = u(langRes.group(1))
            key, value = langRes.group(2), langRes.group(3)
        else:
            langRes = regexNoLang.search(l)
            if langRes:
                curlang = language
                key, value = langRes.group(1), langRes.group(2)
        if curlang is not None:
            if u(key) not in languages:
                languages[u(key)] = {}
            languages[u(key)][curlang] = u(value)
        else:
            pass
    return languages

def languagesFilter(languages, commonto=None, prefix=None, suffix=None, exceptions=[]):
    filtered = {}
    for k in languages:
        if k in exceptions:
            continue
        if commonto is not None:
            doit = True
            for i in commonto:
                if i not in languages[k]:
                    doit = False
                    break
            if not doit:
                continue
        if prefix is not None:
            doit = False
            for i in prefix:
                if k.lower()[:len(i)] == i.lower():
                    doit = True
                    break
            if not doit:
                continue
        if suffix is not None:
            doit = False
            for i in suffix:
                if k.lower()[-len(i):] == i.lower():
                    doit = True
                    break
            if not doit:
                continue
        filtered[u(k)] = languages[k]
    return filtered

def readLocaleFile(f):
    return u(f.decode('utf16'))

def associateLocaleWordFilters(languages, fromLang, toLang, targetPageLang=None):
    for a in languages:
        f = wordFilter(languages[a][toLang], languages[a][fromLang])
        if targetPageLang is None:
            addSafeFilter(f)
        else:
            addSafeFilter(f, language=targetPageLang)

def getRandBits():
    return random.getrandbits(128)

def getFileHash(filename):
    h = hashlib.md5()
    f = open(filename, 'rb')
    for i in f.readlines():
        h.update(i)
    f.close()
    return u(h.hexdigest())
def deleteFile(*fs):
    for f in fs:
        try:
            os.remove(f)
        except:
            pass

def programExists(programName):
    try:
        result = subprocess.call(['which', programName])
        return result == 0
    except:
        return False

def run():
    global config
    print 'Bot started.'
    loadPage(config['pages']['filters'])
    for p in sys.argv[1:]:
        print 'Forced update to', p, '...'
        fixPage(p)
    loadBlacklist()
    patrolChanges()
    updateRCID()
    doPageRequests(force=True)
    doPageRequests(force=False)
    updateEditCount()
    runScheduledTasks()
    try:
        import rcNotify
        rcNotify.main(once=True)
    except:
        pass
    try:
        subprocess.Popen(['killall', 'cpulimit']).communicate()
    except:
        pass
    print 'All done.'

if __name__ == '__main__':
    run()