ויקימילון:מפגשים/האקתון ויקימילון/בוט משימות

מתוך ויקימילון, מיזם רב לשוני ליצירת מילון חופשי שיתופי.

בוט זה פותח במהלך המפגש ומעט אחר כך ועובר על כל הדפים בוויקימילון, ומדווח על דפים שמפרים חוקים מסוימים. הבוט מוגש ברישיון חופשי וכולם מוזמנים לשפר אותו.

# -*- coding: utf-8  -*-
from __future__ import unicode_literals
import pywikibot
from pywikibot import pagegenerators
import re
import pywikibot.textlib
import os
from pywikibot.xmlreader import XmlDump
import requests

# Matches either the ASCII double quote (") or the Hebrew gershayim
# punctuation mark (U+05F4); check_page() searches page titles with it.
GERSHAIM_REGEX = re.compile('["״]')


def get_dump():
    """
    Download the latest Hebrew Wiktionary dump unless a local copy exists.

    The dump is stored as ``pages-articles.xml.bz2`` in the current working
    directory. If that file is already present, nothing is downloaded.

    The download is streamed into a temporary ``.part`` file that is renamed
    to the final name only after the transfer has finished, so an interrupted
    or failed download is never mistaken for a complete dump on the next run.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    dump_path = 'pages-articles.xml.bz2'
    # we already have a dump
    if os.path.exists(dump_path):
        return
    # get a new dump
    print('Dump doesnt exist locally - downloading...')
    partial_path = dump_path + '.part'
    r = requests.get('https://dumps.wikimedia.org/hewiktionary/latest/hewiktionary-latest-pages-articles.xml.bz2',
                     stream=True)
    try:
        # fail loudly instead of saving an HTML error page as the dump
        r.raise_for_status()
        with open(partial_path, 'wb') as dump_fd:
            # 1 MiB chunks: far fewer Python-level iterations than 1 KiB
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    dump_fd.write(chunk)
    finally:
        # always release the streamed connection
        r.close()
    # publish the file under its final name only once it is complete
    os.rename(partial_path, dump_path)
    print('New dump downloaded successfully')


def check_page(page_title, page_text):
    """
    Inspect one entry for deviations from the expected wiktionary structure.

    The rules are tested in a fixed order and only the first broken rule is
    reported. Returns the name of that rule as a string, or None when the
    page breaks none of the rules.
    """
    # names of the categories linked from the wikitext, without namespace
    category_names = []
    for category in pywikibot.textlib.getCategoryLinks(page_text, site):
        category_names.append(category.title(withNamespace=False))

    # parameters of every "grammatical analysis" template on the page
    grammar_boxes = []
    for name, params in pywikibot.textlib.extract_templates_and_params(page_text):
        if name == 'ניתוח דקדוקי':
            grammar_boxes.append(params)

    title_has_gershaim = GERSHAIM_REGEX.search(page_title) is not None
    is_acronym = 'ראשי תיבות' in category_names

    # gershaim in the title, but the page is not categorised as an acronym
    if title_has_gershaim and not is_acronym:
        return 'דפים עם גרשיים שאינם ראשי תיבות'
    # categorised as an acronym, but the title has no gershaim
    if is_acronym and not title_has_gershaim:
        return 'דפים עם ראשי תיבות חסרי גרשיים'
    # the grammatical analysis template is missing
    if not grammar_boxes:
        return 'דפים חסרי ניתוח דקדוקי'
    # TODO: more rules should be added here
    return None


get_dump()  # download the latest dump if it doesn't exist locally
site = pywikibot.Site('he', 'wiktionary')


# open the dump and parse it lazily
all_wiktionary = XmlDump('pages-articles.xml.bz2').parse()

# keep only main-namespace pages that are not redirects; a generator
# expression keeps this streaming (filter() builds a full list on Python 2)
all_wiktionary = (entry for entry in all_wiktionary if entry.ns == '0' and not entry.isredirect)

# find pages with issues according to the (possibly outdated) dump text
pages_with_issues = (pywikibot.Page(site, p.title) for p in all_wiktionary if check_page(p.title, p.text) is not None)

# for pages with issues load the latest revision and check again
pages_with_issues = pagegenerators.PreloadingGenerator(pages_with_issues)

# a dictionary where the key is the issue and the value is the list of titles of pages violating it
pages_by_issues = dict()
for page in pages_with_issues:
    # only fetching the text can raise these, so keep the try body minimal
    try:
        latest_text = page.get()
    except (pywikibot.IsRedirectPage, pywikibot.NoPage):
        # the page was turned into a redirect or deleted since the dump
        continue
    issue = check_page(page.title(), latest_text)
    if issue is None:
        # already fixed since the dump was taken - nothing to report
        continue
    # setdefault records the first page of every issue as well
    pages_by_issues.setdefault(issue, []).append(page.title())

# after going over all pages, report them on maintenance pages so humans can go over them
for issue, pages in pages_by_issues.items():
    report_page = pywikibot.Page(site, 'ויקימילון:תחזוקה/%s' % issue)
    report_content = '\n'.join(['* [[%s]]' % p for p in pages])
    report_page.put(report_content)