ויקימילון:מפגשים/האקתון ויקימילון/בוט משימות

בוט זה פותח במהלך המפגש ומעט אחר כך ועובר על כל הדפים בוויקימילון, ומדווח על דפים שמפרים חוקים מסוימים. הבוט מוגש ברישיון חופשי וכולם מוזמנים לשפר אותו.
# -*- coding: utf-8  -*-
from __future__ import unicode_literals
import pywikibot
from pywikibot import pagegenerators
import re
import pywikibot.textlib
import os
from pywikibot.xmlreader import XmlDump
import requests

GERSHAIM_REGEX = re.compile('["״]')


def get_dump():
    """
    Download the latest Hebrew Wiktionary pages-articles dump if it is missing.

    If 'pages-articles.xml.bz2' already exists in the current working
    directory, nothing is downloaded. Otherwise the dump is streamed into a
    temporary '.part' file which is renamed to the final name only after the
    whole body has been written, so an interrupted download is never mistaken
    for a complete dump on the next run.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    dump_path = 'pages-articles.xml.bz2'
    # we already have a dump
    if os.path.exists(dump_path):
        return
    # get a new dump
    print('Dump doesnt exist locally - downloading...')
    # a stale '.part' file from an earlier failed run is simply overwritten
    partial_path = dump_path + '.part'
    r = requests.get('http://dumps.wikimedia.org/hewiktionary/latest/hewiktionary-latest-pages-articles.xml.bz2',
                     stream=True)
    try:
        # fail loudly instead of saving an HTTP error page as the dump
        r.raise_for_status()
        with open(partial_path, 'wb') as dump_fd:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    dump_fd.write(chunk)
    finally:
        # a streamed response keeps its connection open until it is closed
        r.close()
    # the final name did not exist when we started, so a plain rename is enough
    os.rename(partial_path, dump_path)
    print('New dump downloaded successfully')


def check_page(page_title, page_text):
    """
    Look for deviations from the common entry structure in wiktionary.

    Takes the title and the wikitext of an entry and returns the name of the
    first issue that applies, as a string, or None when none of the checks
    below fires.
    """
    # names of the categories written in the wikitext, without namespace prefix
    category_links = pywikibot.textlib.getCategoryLinks(page_text, site)
    category_names = [link.title(withNamespace=False) for link in category_links]

    # does the wikitext use the 'ניתוח דקדוקי' template at least once?
    templates = pywikibot.textlib.extract_templates_and_params(page_text)
    has_nituch_dikduki = any(name == 'ניתוח דקדוקי' for name, _params in templates)

    title_has_gershaim = bool(GERSHAIM_REGEX.findall(page_title))
    in_rashei_tevot = 'ראשי תיבות' in category_names

    # gershaim in the title and the 'ראשי תיבות' category must go together
    if title_has_gershaim != in_rashei_tevot:
        if title_has_gershaim:
            # gershaim in the title, but the page is not in the category
            return 'דפים עם גרשיים שאינם ראשי תיבות'
        # page is in the category, but the title has no gershaim
        return 'דפים עם ראשי תיבות חסרי גרשיים'

    if not has_nituch_dikduki:
        # the 'ניתוח דקדוקי' template is missing
        return 'דפים חסרי ניתוח דקדוקי'

    # TODO: more checks should be added here
    return None


# make sure a local copy of the dump exists before it is parsed
get_dump()
site = pywikibot.Site('he', 'wiktionary')


def _is_main_namespace_entry(entry):
    """Tell whether a dump entry is in namespace '0' and is not a redirect."""
    return entry.ns == '0' and not entry.isredirect


def _pages_flagged_in_dump(entries):
    """Yield a pywikibot.Page for each dump entry that check_page reports on."""
    for entry in entries:
        if check_page(entry.title, entry.text) is not None:
            yield pywikibot.Page(site, entry.title)


# open the dump, parse it and keep only main namespace entries
all_wiktionary = filter(_is_main_namespace_entry,
                        XmlDump('pages-articles.xml.bz2').parse())

# entries that look problematic in the dump are wrapped in a preloading
# generator, so the latest revision is loaded before they are checked again
pages_with_issues = pagegenerators.PreloadingGenerator(
    _pages_flagged_in_dump(all_wiktionary))

# a dictionary where the key is the issue and the value is the list of titles
# of the pages that violate it
pages_by_issues = dict()
for page in pages_with_issues:
    try:
        # re-run the check against the text fetched from the wiki
        issue = check_page(page.title(), page.get())
    except (pywikibot.IsRedirectPage, pywikibot.NoPage):
        # the page is a redirect or does not exist - nothing to report
        continue
    if issue is None:
        # check_page no longer reports a problem for the fetched text;
        # skipping avoids collecting pages under a None key
        continue
    # setdefault creates the list on first use AND lets us append to it, so
    # the first page found for each issue is recorded as well
    pages_by_issues.setdefault(issue, []).append(page.title())

# once every page was checked, write one maintenance page per issue so that
# humans can go over the listed pages
for issue in pages_by_issues:
    report_page = pywikibot.Page(site, 'ויקימילון:תחזוקה/%s' % issue)
    # one wikitext bullet with a link for each page title
    bullet_lines = []
    for title in pages_by_issues[issue]:
        bullet_lines.append('* [[%s]]' % title)
    report_page.put('\n'.join(bullet_lines))