用戶:P-bot/康熙字典

#!/usr/bin/python
# -*- coding: utf-8  -*-
import glob,re
import codecs
import wikipedia

def getFileList(path, filelist = None):
    """Append every path matching the glob pattern ``path`` to ``filelist``.

    path     -- glob pattern handed to ``glob.glob``.
    filelist -- optional list to extend in place; when omitted a brand-new
                list is created for this call.

    Returns the (extended) list.

    The default used to be a literal ``[]``, which Python creates once at
    definition time and then shares between all calls; results from earlier
    default calls therefore leaked into later ones.  ``None`` is used as the
    "not supplied" marker instead.
    """
    if filelist is None:
        filelist = []
    # In-place extension is kept so that a caller-supplied list is still
    # updated, exactly as before.
    filelist += glob.glob(path)
    return filelist

def _firstMatch(pattern, text):
    """Return the first ``re.findall`` result for ``pattern`` in ``text``, or None."""
    found = re.findall(pattern, text)
    if found:
        return found[0]
    return None

def getElements(filepath, start = '\n'):
    """Extract the dictionary fields from one UTF-8 encoded HTML file.

    filepath -- path of the file to read.
    start    -- expected entry title.  When it is anything other than the
                default ``'\\n'`` and differs from the title found in the
                file, the returned title is None (the other fields are
                still extracted).

    Returns the tuple ``(title, pinyin, cangjie, text, category)``; every
    field whose pattern does not occur in the file is None.
    """
    sfile = codecs.open(filepath, 'r', 'utf-8')
    try:
        stext = sfile.read()
    finally:
        # The handle used to be left open for the garbage collector.
        sfile.close()

    # Regex backslashes are doubled so the (non-raw) unicode literals carry
    # exactly the same pattern text as before without relying on Python
    # passing unknown escapes such as "\s" through unchanged.

    # Title: content of the first <span class="kszi"> element.
    title = _firstMatch(u'<span class="kszi">(.*?)</span>', stext)
    if title is not None and start != '\n' and title != start:
        title = None

    # Pinyin: text after the "\u62fc\u97f3:" label, up to the next <script.
    pinyin = _firstMatch(u'\u62fc\u97f3:</span>(.*?)[\\s\u3000]*<script', stext)

    # Cangjie code: text after the "\u4ed3\u9889:" label, up to </p>.
    cangjie = _firstMatch(u'\u4ed3\u9889:</span>(.*?)[\\s\u3000]*</p>', stext)

    # Category: the bracketed "...\u90e8" heading right before the title span.
    category = _firstMatch(u'\u3010([^\u3010]*?\u90e8)\u3011[\\s\u3000]*<span class="kszi">', stext)
    if category is not None:
        category = category.replace(u'\u5b57','') # drop the character U+5B57
        if category == u'\u5B50\u90e8':
            category += u'(爾雅)'

    # Body text: first single-line run that follows a '>' after the title
    # span and ends at </p>.
    text = _firstMatch(u'<span class="kszi">[\\s\\S]*?>(.*?)</p>', stext)
    if text is not None:
        # Turn tortoise-shell brackets into full-width parentheses.
        text = text.replace(u'\u3014',u'\uff08').replace(u'\u3015',u'\uff09')
        # Start a new bullet before each U+3010 bracket unless it directly
        # follows U+53C8, U+3011, U+898B or '|' (character class unchanged).
        text = re.sub(u'([^\u53C8|\u3011|\u898B])\u3010',u'\\g<1>\n\n* \u3010',text)
        # The result of strip() used to be thrown away.
        text = text.strip('\n')
        if not text.startswith('*'):
            text = '* ' + text
    return (title, pinyin, cangjie, text, category)

def creatWikiPage(elements):
    """Create the wiki page for one dictionary entry if it does not exist yet.

    elements -- sequence indexed as (title, pinyin, cangjie, text, category),
                i.e. the tuple returned by getElements; any field may be None.

    Returns True when a new page was saved, False otherwise (no title, or
    the page already exists).  The "page already exists" case used to fall
    off the end of the function and return None; it now returns False like
    the other "nothing written" path.
    """
    if not elements[0]:
        return False
    cmt = u'僕:立新典'
    enter = '\n\n'
    text = ''
    if elements[1]:
        text += u'* (漢語拼音)' + elements[1] + enter
    if elements[2]:
        text += u'* (倉頡碼)' + elements[2] + enter
    if elements[3]:
        # The source section is only emitted together with the body text.
        text += elements[3] + enter + u'==據==\n* 《康熙字典》'
    if elements[4]:
        # The sort key is substituted by the wiki when the page is saved.
        text += enter + '[[Category:' + elements[4] + '|{{subst:SUBPAGENAME}}]]'
    wikipage = wikipedia.Page(wikipedia.getSite(), u'維基大典:維基爾雅/' + elements[0])
    if wikipage.exists():
        # Never overwrite an existing page.
        return False
    wikipage.put(newtext=text, comment=cmt, minorEdit=False)
    return True

def main():
    """Walk the saved dictionary files and create a wiki page for each entry.

    Command line: ``-start:TITLE`` skips files until one whose extracted
    title equals TITLE; processing begins with that file.  Without the
    option, processing begins with the first file that yields a title.
    """
    # '\n' is getElements' own "no title filter" default.  Previously
    # ``title`` was only bound when -start: was given, so running without
    # the option raised UnboundLocalError at the first getElements call.
    title = '\n'
    for arg in wikipedia.handleArgs():
        if arg.startswith('-start:'):
            title = arg[7:]

    # Raw strings: same values as before, without depending on "\w", "\z"
    # and "\*" being unrecognised escape sequences.
    patterns = (
        r'E:\www.zdic.net\zd\zi\*',
        r'E:\www.zdic.net\zd\zi2\*',
        r'E:\www.zdic.net\zd\zi3\*',
        r'E:\www.zdic.net\zd\zi5\*',
    )
    # An explicit list is always passed so this function never touches
    # getFileList's default argument.
    filelist = []
    for pattern in patterns:
        filelist = getFileList(pattern, filelist)

    started = False
    for filepath in filelist:
        if started:
            elements = getElements(filepath)
        else:
            elements = getElements(filepath, title)
            if not elements[0]:
                # Still before the requested start entry.
                continue
            started = True
        creatWikiPage(elements)

# Script entry point: run the bot, and always call wikipedia.stopme()
# afterwards, even when main() raises.
# NOTE(review): stopme() presumably tells the pywikipedia framework that this
# bot has finished -- confirm against the framework documentation.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()