ויקימילון:מפגשים/האקתון ויקימילון/בוט משימות
This bot was developed during the meetup and shortly afterwards. It goes over all the pages in Wiktionary and reports pages that violate certain rules. The bot is released under a free license, and everyone is welcome to improve it.
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import pywikibot
from pywikibot import pagegenerators
import re
import pywikibot.textlib
import os
from pywikibot.xmlreader import XmlDump
import requests
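# matches either a regular double quote (") or the Hebrew gershayim character (״) in a page title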
GERSHAIM_REGEX = re.compile('["״]')
def get_dump():
    """
    This function downloads the latest Hebrew Wiktionary dump, unless a local copy already exists.
    """
    # we already have a dump
    if os.path.exists('pages-articles.xml.bz2'):
        return
    # get a new dump
    print("Dump doesn't exist locally - downloading...")
    r = requests.get('http://dumps.wikimedia.org/hewiktionary/latest/hewiktionary-latest-pages-articles.xml.bz2',
                     stream=True)
    with open('pages-articles.xml.bz2', 'wb') as dump_fd:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                dump_fd.write(chunk)
    print('New dump downloaded successfully')
def check_page(page_title, page_text):
    """
    This function checks for violations of the common structure in Wiktionary.
    It reports the issue found as a string, or returns None if no issue was found.
    """
    text_categories = [cat.title(withNamespace=False) for cat in pywikibot.textlib.getCategoryLinks(page_text, site)]
    parsed_page_templates = pywikibot.textlib.extract_templates_and_params(page_text)
    nituch_dikduki = [template_params for template_name, template_params in parsed_page_templates
                      if template_name == 'ניתוח דקדוקי']
    has_nituch_dikduki = len(nituch_dikduki) > 0
    if GERSHAIM_REGEX.findall(page_title) and ('ראשי תיבות' not in text_categories):
        return 'דפים עם גרשיים שאינם ראשי תיבות'  # title contains gershayim but the page is not categorized as an acronym
    elif not GERSHAIM_REGEX.findall(page_title) and ('ראשי תיבות' in text_categories):
        return 'דפים עם ראשי תיבות חסרי גרשיים'  # categorized as an acronym but the title has no gershayim
    elif not has_nituch_dikduki:
        return 'דפים חסרי ניתוח דקדוקי'  # page is missing the 'ניתוח דקדוקי' (grammatical analysis) template
    else:
        return None  # TODO: extend with more checks
get_dump()  # download the latest dump if it doesn't already exist locally
site = pywikibot.Site('he', 'wiktionary')
all_wiktionary = XmlDump('pages-articles.xml.bz2').parse() # open the dump and parse it.
# keep only main-namespace pages that are not redirects
all_wiktionary = filter(lambda page: page.ns == '0' and not page.isredirect, all_wiktionary)
# find pages with issues
pages_with_issues = (pywikibot.Page(site, p.title) for p in all_wiktionary if check_page(p.title, p.text) is not None)
# for pages with issues load the latest revision and check again
pages_with_issues = pagegenerators.PreloadingGenerator(pages_with_issues)
# a dictionary where the key is the issue and the value is the list of pages violating it
pages_by_issues = dict()
for page in pages_with_issues:
    try:
        issue = check_page(page.title(), page.get())
        if issue is None:  # the live revision no longer violates any rule
            continue
        if issue not in pages_by_issues:
            pages_by_issues[issue] = []
        pages_by_issues[issue].append(page.title())  # append the page whether the issue is new or already known
    except pywikibot.IsRedirectPage:
        continue
    except pywikibot.NoPage:
        continue
# after going over all the pages, report them to maintenance pages so humans can go over them
for issue, pages in pages_by_issues.items():
    report_page = pywikibot.Page(site, 'ויקימילון:תחזוקה/%s' % issue)
    report_content = '\n'.join(['* [[%s]]' % p for p in pages])
    report_page.put(report_content)
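To try out the checks on a single entry without processing a full dump, one can call check_page directly on a live page; this is only a minimal sketch, and the page title used here is an arbitrary example:

test_page = pywikibot.Page(site, 'מים')
print(check_page(test_page.title(), test_page.get()))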