#!/usr/bin/python
# -*- coding: utf-8 -*-
import glob,re
import codecs
import wikipedia
def getFileList(path, filelist=None):
    """Append every path matching the glob pattern *path* to *filelist*.

    path     -- glob pattern handed to glob.glob().
    filelist -- optional list to extend in place; a fresh list is created
                when it is omitted.

    Returns the (extended) list. When a list is supplied, the very same
    object is returned.
    """
    # A literal [] default would be created once and shared by every call
    # that omits the argument, so earlier results would leak into later ones.
    if filelist is None:
        filelist = []
    filelist += glob.glob(path)
    return filelist
def _firstGroup(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = re.search(pattern, text)
    if match is None:
        return None
    return match.group(1)


def getElements(filepath, start='\n'):
    """Extract the fields of one dictionary entry from a saved HTML file.

    filepath -- path of a UTF-8 encoded file to read.
    start    -- when different from the sentinel '\\n', the title is
                reported only if it equals this value; otherwise the
                returned title is None.

    Returns the tuple (title, pinyin, cangjie, text, category). Every field
    whose pattern does not match is None.
    """
    # The context manager closes the file even if reading or decoding fails.
    with codecs.open(filepath, 'r', 'utf-8') as sfile:
        stext = sfile.read()

    # Title
    title = _firstGroup(u'<span class="kszi">(.*?)</span>', stext)
    if title is not None and start != '\n' and title != start:
        title = None

    # Pinyin
    pinyin = _firstGroup(u'\u62fc\u97f3:</span>(.*?)[\\s\u3000]*<script', stext)

    # Cangjie code
    cangjie = _firstGroup(u'\u4ed3\u9889:</span>(.*?)[\\s\u3000]*</p>', stext)

    # Category
    category = _firstGroup(
        u'\u3010([^\u3010]*?\u90e8)\u3011[\\s\u3000]*<span class="kszi">',
        stext)
    if category is not None:
        # Drop the character U+5B57 from the category name.
        category = category.replace(u'\u5b57', '')
        if category == u'\u5B50\u90e8':
            category += u'(爾雅)'

    # Body text
    text = _firstGroup(u'<span class="kszi">[\\s\\S]*?>(.*?)</p>', stext)
    if text is not None:
        # Swap tortoise-shell brackets for full-width parentheses.
        text = text.replace(u'\u3014', u'\uff08').replace(u'\u3015', u'\uff09')
        # Start a new bullet before each U+3010 bracket unless it directly
        # follows one of the excluded characters.
        # NOTE(review): the '|' characters sit inside the character class,
        # so a literal '|' is excluded as well -- probably unintended;
        # kept as-is to preserve existing output.
        text = re.sub(u'([^\u53C8|\u3011|\u898B])\u3010',
                      u'\\g<1>\n\n* \u3010', text)
        # str.strip returns a new string; the result has to be kept.
        text = text.strip('\n')
        if not text.startswith('*'):
            text = '* ' + text

    return (title, pinyin, cangjie, text, category)
def creatWikiPage(elements):
    """Assemble the wikitext for one entry and save it if the page is new.

    elements -- sequence indexed as (title, pinyin, cangjie, text, category),
                the layout produced by getElements().

    Returns False immediately when the title is missing or empty. Otherwise
    the page is written only if it does not exist yet, and True is returned.
    """
    if not elements[0]:
        return False

    gap = '\n\n'
    pieces = []
    if elements[1]:
        pieces.append(u'* (漢語拼音)' + elements[1] + gap)
    if elements[2]:
        pieces.append(u'* (倉頡碼)' + elements[2] + gap)
    if elements[3]:
        pieces.append(elements[3] + gap + u'==據==\n* 《康熙字典》')
    if elements[4]:
        pieces.append(gap + '[[Category:' + elements[4] + '|{{subst:SUBPAGENAME}}]]')
    content = ''.join(pieces)

    target = wikipedia.Page(wikipedia.getSite(), u'維基大典:維基爾雅/' + elements[0])
    # Existing pages are left untouched.
    if not target.exists():
        target.put(newtext=content, comment=u'僕:立新典', minorEdit=False)
    return True
def main():
    """Create wiki pages for every saved dictionary file.

    Among the arguments returned by wikipedia.handleArgs(), '-start:TITLE'
    makes processing begin at the first file whose title equals TITLE.
    Without it, processing begins at the first file that yields a title.
    """
    # '\n' is the "no filter" sentinel of getElements(). Defaulting to it
    # keeps `title` bound when no -start: argument is given.
    title = '\n'
    for arg in wikipedia.handleArgs():
        if arg.startswith('-start:'):
            title = arg[7:]

    # Raw strings: the backslashes are path separators, not escapes.
    patterns = (
        r'E:\www.zdic.net\zd\zi\*',
        r'E:\www.zdic.net\zd\zi2\*',
        r'E:\www.zdic.net\zd\zi3\*',
        r'E:\www.zdic.net\zd\zi5\*',
    )
    filelist = []
    for pattern in patterns:
        filelist = getFileList(pattern, filelist)

    after = False
    for filepath in filelist:
        if after:
            # Past the starting entry: take every file unfiltered.
            creatWikiPage(getElements(filepath))
            continue
        # Still looking for the starting entry.
        elements = getElements(filepath, title)
        if elements[0]:
            after = True
            creatWikiPage(elements)
# Script entry point: run the bot only when executed directly.
if __name__ == '__main__':
    try:
        main()
    finally:
        # Runs whether main() returns normally or raises.
        wikipedia.stopme()