diff --git a/apps/View/DbViewer.py b/apps/View/DbViewer.py index 2ad51fed..2216b9d3 100755 --- a/apps/View/DbViewer.py +++ b/apps/View/DbViewer.py @@ -19,17 +19,17 @@ def GET(self): #可以修改UrlEncoding,如果chardet自动检测的编码错误的话 action = web.input().get('action') if action == 'modurlenc': - id = int(web.input().get('id', 0)) + id_ = int(web.input().get('id', 0)) feedenc = web.input().get('feedenc') pageenc = web.input().get('pageenc') - urlenc = UrlEncoding.get_by_id(id) + urlenc = UrlEncoding.get_by_id(id_) if urlenc: if feedenc: urlenc.feedenc = feedenc if pageenc: urlenc.pageenc = pageenc urlenc.put() elif action == 'delurlenc': - id = int(web.input().get('id', 0)) - urlenc = UrlEncoding.get_by_id(id) + id_ = int(web.input().get('id', 0)) + urlenc = UrlEncoding.get_by_id(id_) if urlenc: urlenc.delete() return self.render('dbviewer.html', "DbViewer", diff --git a/apps/View/Deliver.py b/apps/View/Deliver.py index e41f2b88..0e4e6cd8 100755 --- a/apps/View/Deliver.py +++ b/apps/View/Deliver.py @@ -24,10 +24,10 @@ def queueit(self, usr, bookid, separate): param = {"u":usr.name, "id":bookid} if usr.merge_books and not separate: - self.queue2push[usr.name].append(str(bookid)) + self.queue2push[usr.name].append(str(bookid)) #合并推送 else: - taskqueue.add(url='/worker',queue_name="deliverqueue1",method='GET', - params=param,target="worker") + taskqueue.add(url='/worker', queue_name="deliverqueue1", method='GET', + params=param, target="worker") def flushqueue(self): for name in self.queue2push: @@ -38,17 +38,17 @@ def flushqueue(self): def GET(self): username = web.input().get('u') - id = web.input().get('id') #for debug + id_ = web.input().get('id') #for debug self.queue2push = defaultdict(list) books = Book.all() - if username: #现在投递,不判断时间和星期 + if username: #现在投递【测试使用】,不需要判断时间和星期 sent = [] - books2push = Book.get_by_id(int(id)) if id and id.isdigit() else None + books2push = Book.get_by_id(int(id_)) if id_ and id_.isdigit() else None books2push = [books2push] if books2push else books for book in books2push: - if not id and username not in book.users: + if not id_ and username not in book.users: continue user = KeUser.all().filter("name = ", username).get() if user and user.kindle_email: diff --git a/apps/View/Logs.py b/apps/View/Logs.py index 55a364d5..ad3cfba7 100755 --- a/apps/View/Logs.py +++ b/apps/View/Logs.py @@ -10,6 +10,11 @@ from apps.dbModels import * from apps.utils import etagged from google.appengine.api.datastore_errors import NeedIndexError +import web +try: + import json +except ImportError: + import simplejson as json class Mylogs(BaseHandler): __url__ = "/logs" @@ -20,6 +25,8 @@ def GET(self): mylogs = DeliverLog.all().filter("username = ", user.name).order('-time').fetch(limit=10) except NeedIndexError: #很多人不会部署,经常出现没有建立索引的情况,干脆碰到这种情况直接消耗CPU时间自己排序得了 mylogs = sorted(DeliverLog.all().filter("username = ", user.name), key=attrgetter('time'), reverse=True)[:10] + + #其他用户的推送记录 logs = {} if user.name == 'admin': for u in KeUser.all().filter("name != ", 'admin'): @@ -29,9 +36,26 @@ def GET(self): ul = sorted(DeliverLog.all().filter("username = ", user.name), key=attrgetter('time'), reverse=True)[:5] if ul: logs[u.name] = ul + + #管理员可以查看所有用户的已推送期号,其他用户只能查看自己的已推送期号 + if user.name == 'admin': + try: + lastDelivered = LastDelivered.all().order('-datetime').fetch(limit=100) + except NeedIndexError: + lastDelivered = sorted(LastDelivered.all().fetch(), key=attrgetter('datetime'), reverse=True)[:100] + else: + try: + lastDelivered = LastDelivered.all().filter('username = ', 
user.name).order('-datetime').fetch(limit=100) + except NeedIndexError: + lastDelivered = sorted(LastDelivered.all().filter('username = ', user.name), key=attrgetter('datetime'), reverse=True)[:100] + + if len(lastDelivered) == 0: + lastDelivered = None + return self.render('logs.html', "Deliver log", current='logs', - mylogs=mylogs, logs=logs) + mylogs=mylogs, logs=logs, lastDelivered=lastDelivered) +#每天自动运行的任务,清理过期log class RemoveLogs(BaseHandler): __url__ = "/removelogs" def GET(self): @@ -41,10 +65,59 @@ def GET(self): user.enable_send = False user.put() + #清理30天之前的推送记录 query = DeliverLog.all() - query.filter('datetime < ', datetime.datetime.utcnow() - datetime.timedelta(days=25)) + query.filter('datetime < ', datetime.datetime.utcnow() - datetime.timedelta(days=30)) logs = query.fetch(1000) c = len(logs) db.delete(logs) - return "%s lines log removed.
" % c \ No newline at end of file + #清理过期的已推送期号 + query = LastDelivered.all() + query.filter('datetime < ', datetime.datetime.utcnow() - datetime.timedelta(days=90)) + logs = query.fetch(1000) + db.delete(logs) + + return "%s lines delivery log removed.
" % c + +#修改/删除已推送期号的AJAX处理函数 +class LastDeliveredAjax(BaseHandler): + __url__ = "/lastdelivered/(.*)" + + def POST(self, mgrType): + web.header('Content-Type', 'application/json') + user = self.getcurrentuser() + + if mgrType.lower() == 'delete': + id_ = web.input().get('id_') + try: + id_ = int(id_) + except: + return json.dumps({'status': _('The id is invalid!')}) + + dbItem = LastDelivered.get_by_id(id_) + if dbItem: + dbItem.delete() + return json.dumps({'status':'ok'}) + else: + return json.dumps({'status': _('The LastDelivered item(%d) not exist!') % id_}) + elif mgrType.lower() == 'change': + id_ = web.input().get('id_') + num = web.input().get('num') + try: + id_ = int(id_) + num = int(num) + except: + return json.dumps({'status': _('The id or num is invalid!')}) + + dbItem = LastDelivered.get_by_id(id_) + if dbItem: + dbItem.num = num + dbItem.record = '' #手工修改了期号则清空文字描述 + dbItem.put() + return json.dumps({'status': 'ok', 'num': num}) + else: + return json.dumps({'status': _('The LastDelivered item(%d) not exist!') % id_}) + else: + return json.dumps({'status': 'unknown command: %s' % mgrType}) + \ No newline at end of file diff --git a/apps/View/Setting.py b/apps/View/Setting.py index aa5a8aed..35ebdc27 100755 --- a/apps/View/Setting.py +++ b/apps/View/Setting.py @@ -21,37 +21,39 @@ class Setting(BaseHandler): @etagged() def GET(self, tips=None): user = self.getcurrentuser() - return self.render('setting.html',"Setting", - current='setting',user=user,mail_sender=SRC_EMAIL,tips=tips) + return self.render('setting.html', "Setting", + current='setting', user=user, mail_sender=SRC_EMAIL, tips=tips) def POST(self): user = self.getcurrentuser() - kemail = web.input().get('kindleemail') - mytitle = web.input().get("rt") + webInput = web.input() + kemail = webInput.get('kindleemail') + mytitle = webInput.get("rt") if not kemail: tips = _("Kindle E-mail is requied!") elif not mytitle: tips = _("Title is requied!") else: user.kindle_email = kemail - user.timezone = int(web.input().get('timezone', TIMEZONE)) - user.send_time = int(web.input().get('sendtime')) - user.enable_send = bool(web.input().get('enablesend')) - user.book_type = web.input().get('booktype') - user.device = web.input().get('devicetype') or 'kindle' - user.use_title_in_feed = bool(web.input().get('titlefrom') == 'feed') - user.titlefmt = web.input().get('titlefmt') + user.timezone = int(webInput.get('timezone', TIMEZONE)) + user.send_time = int(webInput.get('sendtime')) + user.enable_send = bool(webInput.get('enablesend')) + user.book_type = webInput.get('booktype') + user.device = webInput.get('devicetype') or 'kindle' + user.use_title_in_feed = bool(webInput.get('titlefrom') == 'feed') + user.titlefmt = webInput.get('titlefmt') alldays = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'] - user.send_days = [day for day in alldays if web.input().get(day)] - user.merge_books = bool(web.input().get('mergebooks')) + user.send_days = [day for day in alldays if webInput.get(day)] + user.merge_books = bool(webInput.get('mergebooks')) + user.book_mode = webInput.get('bookmode') user.put() myfeeds = user.ownfeeds - myfeeds.language = web.input().get("lng") + myfeeds.language = webInput.get("lng") myfeeds.title = mytitle - myfeeds.keep_image = bool(web.input().get("keepimage")) - myfeeds.oldest_article = int(web.input().get('oldest', 7)) - myfeeds.users = [user.name] if web.input().get("enablerss") else [] + myfeeds.keep_image = bool(webInput.get("keepimage")) + myfeeds.oldest_article = 
int(webInput.get('oldest', 7)) + myfeeds.users = [user.name] if webInput.get("enablerss") else [] myfeeds.put() tips = _("Settings Saved!") diff --git a/apps/View/Share.py b/apps/View/Share.py index 65483d55..c6f3c403 100755 --- a/apps/View/Share.py +++ b/apps/View/Share.py @@ -59,7 +59,7 @@ def SaveToEvernoteWiz(self, user, action, orgUrl): main.log.warn('No have wiz mail yet.') return "No have wiz mail yet." - book = BaseUrlBook() + book = BaseUrlBook(user=user) book.title = book.description = action book.language = user.ownfeeds.language book.keep_image = user.ownfeeds.keep_image diff --git a/apps/View/Subscribe.py b/apps/View/Subscribe.py index cfda62b7..12933f7d 100755 --- a/apps/View/Subscribe.py +++ b/apps/View/Subscribe.py @@ -17,6 +17,8 @@ from apps.utils import etagged from apps.BaseHandler import BaseHandler from apps.dbModels import * +from books import BookClasses, BookClass +from books.base import BaseComicBook class MySubscription(BaseHandler): __url__ = "/my" @@ -44,6 +46,7 @@ def POST(self): # 添加自定义RSS memcache.delete('%d.feedscount'%user.ownfeeds.key().id()) raise web.seeother('/my') +#添加/删除自定义RSS订阅的AJAX处理函数 class FeedsAjax(BaseHandler): __url__ = "/feeds/(.*)" @@ -84,25 +87,28 @@ def POST(self, mgrType): respDict['feedid'] = fd.key().id() memcache.delete('%d.feedscount' % user.ownfeeds.key().id()) return json.dumps(respDict) + else: + return json.dumps({'status': 'unknown command: %s' % mgrType}) + +#订阅/退订内置书籍的AJAX处理函数 class BooksAjax(BaseHandler): __url__ = "/books/(.*)" def POST(self, mgrType): web.header('Content-Type', 'application/json') user = self.getcurrentuser() + id_ = web.input().get('id_') + try: + id_ = int(id_) + except: + return json.dumps({'status': _('The id is invalid!')}) - if mgrType.lower() == 'unsubscribe': - id_ = web.input().get('id_') - try: - id_ = int(id_) - except: - return json.dumps({'status': _('The id is invalid!')}) - - bk = Book.get_by_id(id_) - if not bk: - return json.dumps({'status': _('The book(%d) not exist!') % id_}) + bk = Book.get_by_id(id_) + if not bk: + return json.dumps({'status': _('The book(%d) not exist!') % id_}) + if mgrType.lower() == 'unsubscribe': if user.name in bk.users: bk.users.remove(user.name) bk.separate = False @@ -115,23 +121,21 @@ def POST(self, mgrType): return json.dumps({'status':'ok', 'title': bk.title, 'desc': bk.description}) elif mgrType.lower() == 'subscribe': - id_ = web.input().get('id_') separate = web.input().get('separate', '') respDict = {'status':'ok'} - try: - id_ = int(id_) - except: - return json.dumps({'status': _('The id is invalid')}) - - bk = Book.get_by_id(id_) - if not bk: + bkcls = BookClass(bk.title) + if not bkcls: return json.dumps({'status': 'The book(%d) not exist!' % id_}) + #如果是漫画类,则不管是否选择了“单独推送”,都自动变成“单独推送” + if issubclass(bkcls, BaseComicBook): + separate = 'true' + if user.name not in bk.users: bk.users.append(user.name) - bk.separate = bool(separate.lower() in ('true','1')) + bk.separate = bool(separate.lower() in ('true', '1')) bk.put() respDict['title'] = bk.title @@ -140,7 +144,9 @@ def POST(self, mgrType): respDict['subscription_info'] = bool(user.subscription_info(bk.title)) respDict['separate'] = bk.separate return json.dumps(respDict) - + else: + return json.dumps({'status': 'unknown command: %s' % mgrType}) + class Subscribe(BaseHandler): __url__ = "/subscribe/(.*)" def GET(self, id_): @@ -154,9 +160,19 @@ def GET(self, id_): if not bk: return "the book(%d) not exist!
" % id_ + bkcls = BookClass(bk.title) + if not bkcls: + return "the book(%d) not exist!
" % id_ + + #如果是漫画类,则不管是否选择了“单独推送”,都自动变成“单独推送” + if issubclass(bkcls, BaseComicBook): + separate = 'true' + else: + separate = web.input().get('separate', 'true') + if main.session.username not in bk.users: bk.users.append(main.session.username) - bk.separate = bool(web.input().get('separate') in ('true','1')) + bk.separate = bool(separate in ('true', '1')) bk.put() raise web.seeother('/my') diff --git a/apps/View/UpdateLogs.py b/apps/View/UpdateLogs.py deleted file mode 100755 index 144a63e5..00000000 --- a/apps/View/UpdateLogs.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -#A GAE web application to aggregate rss and send it to your kindle. -#Visit https://github.com/cdhigh/KindleEar for the latest version -#Contributors: -# rexdf -import web - -from apps.BaseHandler import BaseHandler -from apps.dbModels import * -from apps.utils import etagged - -class Uplogs(BaseHandler): - __url__ = "/updatelogs" - @etagged() - def GET(self, tips=None): - uplogs = UpdateLog.all() - return self.render('updatelogs.html', "Update log", current='updatelogs', uplogs=uplogs, tips=tips) - - - def POST(self): - uplogs = UpdateLog.all() - for log in uplogs: - name = log.comicname - count = int(web.input().get(name.encode("utf"))) - if count == 0: - log.delete() - elif count != log.updatecount: - log.delete() - dl = UpdateLog(comicname=name, updatecount=count) - dl.put() - - newname = web.input().get("newname") - newcount = web.input().get("newcount") - if newname != "" and newcount != "": - dl = UpdateLog(comicname=newname, updatecount=int(newcount)) - dl.put() - - tips = _("Settings Saved!") - - return self.GET(tips) diff --git a/apps/Work/Url2Book.py b/apps/Work/Url2Book.py index 330aa319..e01989a8 100755 --- a/apps/Work/Url2Book.py +++ b/apps/Work/Url2Book.py @@ -29,7 +29,7 @@ def GET(self): keepimage = bool(web.input().get("keepimage") == '1') booktype = web.input().get("type", "mobi") tz = int(web.input().get("tz", TIMEZONE)) - if not all((username,urls,subject,to,language,booktype,tz)): + if not all((username, urls, subject, to, language, booktype, tz)): return "Some parameter missing!
" if (';' in to) or (',' in to): @@ -53,7 +53,7 @@ def GET(self): else: if not dlinfo: dlinfo = 'download failed' - self.deliverlog(username, str(to), filename, 0, status=dlinfo,tz=tz) + self.deliverlog(username, str(to), filename, 0, status=dlinfo, tz=tz) main.log.info("%s Sent!" % filename) return "%s Sent!" % filename elif booktype == 'Debug': #调试目的,将链接直接下载,发送到管理员邮箱 @@ -71,8 +71,10 @@ def GET(self): user = KeUser.all().filter("name = ", username).get() if not user or not user.kindle_email: return "User not exist!
" - - book = BaseUrlBook() + + opts = getOpts(user.device) + + book = BaseUrlBook(opts=opts, user=user) book.title = book.description = subject book.language = language book.keep_image = keepimage @@ -83,14 +85,13 @@ def GET(self): opts = oeb = None # 创建 OEB - opts = getOpts(user.device) oeb = CreateOeb(main.log, None, opts) oeb.container = ServerContainer(main.log) if len(book.feeds) > 1: setMetaData(oeb, subject, language, local_time(tz=tz)) - id, href = oeb.manifest.generate('masthead', DEFAULT_MASTHEAD) - oeb.manifest.add(id, href, MimeFromFilename(DEFAULT_MASTHEAD)) + id_, href = oeb.manifest.generate('masthead', DEFAULT_MASTHEAD) + oeb.manifest.add(id_, href, MimeFromFilename(DEFAULT_MASTHEAD)) oeb.guide.add('masthead', 'Masthead Image', href) else: setMetaData(oeb, subject, language, local_time(tz=tz), pubtype='book:book:KindleEar') @@ -105,10 +106,10 @@ def GET(self): itemcnt,hasimage = 0,False sections = {subject:[]} toc_thumbnails = {} #map img-url -> manifest-href - for sec_or_media, url, title, content, brief, thumbnail in book.Items(opts,user): + for sec_or_media, url, title, content, brief, thumbnail in book.Items(): if sec_or_media.startswith(r'image/'): - id, href = oeb.manifest.generate(id='img', href=title) - item = oeb.manifest.add(id, href, sec_or_media, data=content) + id_, href = oeb.manifest.generate(id='img', href=title) + item = oeb.manifest.add(id_, href, sec_or_media, data=content) if thumbnail: toc_thumbnails[url] = href itemcnt += 1 @@ -117,8 +118,8 @@ def GET(self): if len(book.feeds) > 1: sections[subject].append((title, brief, thumbnail, content)) else: - id, href = oeb.manifest.generate(id='page', href='page.html') - item = oeb.manifest.add(id, href, 'application/xhtml+xml', data=content) + id_, href = oeb.manifest.generate(id='page', href='page.html') + item = oeb.manifest.add(id_, href, 'application/xhtml+xml', data=content) oeb.spine.add(item, False) oeb.toc.add(title, href) @@ -126,7 +127,7 @@ def GET(self): if itemcnt > 0: if len(book.feeds) > 1: - InsertToc(oeb, sections, toc_thumbnails) + InsertToc(oeb, sections, toc_thumbnails, GENERATE_HTML_TOC, GENERATE_TOC_THUMBNAIL) # elif not hasimage: #单文章没有图片则去掉封面 # href = oeb.guide['cover'].href # oeb.guide.remove('cover') @@ -142,7 +143,7 @@ def GET(self): main.log.info(rs) return rs else: - self.deliverlog(username, str(to), book.title, 0, status='fetch failed',tz=tz) + self.deliverlog(username, str(to), book.title, 0, status='fetch failed', tz=tz) rs = "[Url2Book]Fetch url failed." 
main.log.info(rs) return rs diff --git a/apps/Work/Worker.py b/apps/Work/Worker.py index 4e26eaac..be66fb40 100755 --- a/apps/Work/Worker.py +++ b/apps/Work/Worker.py @@ -16,14 +16,14 @@ from collections import OrderedDict from apps.BaseHandler import BaseHandler from apps.dbModels import * -from apps.utils import InsertToc, local_time +from apps.utils import InsertToc, local_time, get_exc_location from lib.makeoeb import * from calibre.ebooks.conversion.mobioutput import MOBIOutput from calibre.ebooks.conversion.epuboutput import EPUBOutput from calibre.utils.bytestringio import byteStringIO from books import BookClasses, BookClass -from books.base import BaseFeedBook - +from books.base import BaseFeedBook, BaseComicBook + #实际下载文章和生成电子书并且发送邮件 class Worker(BaseHandler): __url__ = "/worker" @@ -39,7 +39,8 @@ def GET(self): if (';' in to) or (',' in to): to = to.replace(',', ';').replace(' ', '').split(';') - booktype = user.book_type + booktype = user.book_type #mobi,epub + bookmode = user.book_mode or 'periodical' #periodical,comic titlefmt = user.titlefmt tz = user.timezone @@ -60,6 +61,8 @@ def GET(self): book4meta = BookClass(bks[0].title) mhfile = book4meta.mastheadfile coverfile = book4meta.coverfile + if issubclass(book4meta, BaseComicBook): #如果单独推送一个继承自BaseComicBook的书籍,则自动设置为漫画模式 + bookmode = 'comic' else: #单独的推送自定义RSS book4meta = bks[0] mhfile = DEFAULT_MASTHEAD @@ -77,11 +80,16 @@ def GET(self): # 创建 OEB #global log - opts = getOpts(user.device) + opts = getOpts(user.device, bookmode) oeb = CreateOeb(main.log, None, opts) - title = "%s %s" % (book4meta.title, local_time(titlefmt, tz)) if titlefmt else book4meta.title + bookTitle = "%s %s" % (book4meta.title, local_time(titlefmt, tz)) if titlefmt else book4meta.title - setMetaData(oeb, title, book4meta.language, local_time("%Y-%m-%d",tz), 'KindleEar') + if bookmode == 'comic': + pubtype = 'book:book:KindleEar' + else: + pubtype = 'periodical:magazine:KindleEar' + + setMetaData(oeb, bookTitle, book4meta.language, local_time("%Y-%m-%d",tz), pubtype=pubtype) oeb.container = ServerContainer(main.log) #guide @@ -105,10 +113,10 @@ def GET(self): if imgType: #如果是合法图片 imgMime = r"image/" + imgType else: - main.log.warn('content of cover is invalid : [%s].' % title) + main.log.warn('content of cover is invalid : [%s].' % bookTitle) imgData = None except Exception as e: - main.log.warn('Failed to fetch cover for book [%s]. [Error: %s]' % (title, str(e))) + main.log.warn('Failed to fetch cover for book [%s]. [Error: %s]' % (bookTitle, str(e))) coverfile = DEFAULT_COVER imgData = None imgMime = '' @@ -129,16 +137,16 @@ def GET(self): oeb.guide.add('cover', 'Cover', href) oeb.metadata.add('cover', id_) - itemcnt,imgindex = 0,0 + itemcnt, imgindex = 0, 0 sections = OrderedDict() toc_thumbnails = {} #map img-url -> manifest-href for bk in bks: if bk.builtin: - book = BookClass(bk.title) - if not book: + cbook = BookClass(bk.title) + if not cbook: main.log.warn('not exist book <%s>' % bk.title) continue - book = book(imgindex=imgindex) + book = cbook(imgindex=imgindex, opts=opts, user=user) book.url_filters = [flt.url for flt in user.urlfilter] if bk.needs_subscription: #需要登录 subs_info = user.subscription_info(bk.title) @@ -148,7 +156,8 @@ def GET(self): else: # 自定义RSS if bk.feedscount == 0: continue #return "the book has no feed!
" - book = BaseFeedBook(imgindex=imgindex) + + book = BaseFeedBook(imgindex=imgindex, opts=opts, user=user) book.title = bk.title book.description = bk.description book.language = bk.language @@ -162,8 +171,8 @@ def GET(self): # 对于html文件,变量名字自文档,thumbnail为文章第一个img的url # 对于图片文件,section为图片mime,url为原始链接,title为文件名,content为二进制内容, # img的thumbail仅当其为article的第一个img为True - try: #书的质量可能不一,一本书的异常不能影响推送 - for sec_or_media, url, title, content, brief, thumbnail in book.Items(opts,user): + try: #书的质量可能不一,一本书的异常不能影响其他书籍的推送 + for sec_or_media, url, title, content, brief, thumbnail in book.Items(): if not sec_or_media or not title or not content: continue @@ -181,11 +190,26 @@ def GET(self): sections[sec_or_media].append((title, brief, thumbnail, content)) itemcnt += 1 except Exception as e: - main.log.warn("Failure in pushing book '%s' : %s" % (book.title, str(e))) + excFileName, excFuncName, excLineNo = get_exc_location() + main.log.warn("Failed to push <%s> : %s, in file '%s', %s (line %d)" % ( + book.title, str(e), excFileName, excFuncName, excLineNo)) continue - + + volumeTitle = '' if itemcnt > 0: - InsertToc(oeb, sections, toc_thumbnails) + #漫画模式不需要TOC和缩略图 + if bookmode == 'comic': + insertHtmlToc = False + insertThumbnail = False + if len(bks) == 1 and book: #因为漫画模式没有目录,所以在标题中添加卷号 + volumeTitle = book.LastDeliveredVolume() + oeb.metadata.clear('title') + oeb.metadata.add('title', bookTitle + volumeTitle) + else: + insertHtmlToc = GENERATE_HTML_TOC + insertThumbnail = GENERATE_TOC_THUMBNAIL + + InsertToc(oeb, sections, toc_thumbnails, insertHtmlToc, insertThumbnail) oIO = byteStringIO() o = EPUBOutput() if booktype == "epub" else MOBIOutput() o.convert(oeb, oIO, opts, main.log) @@ -198,12 +222,12 @@ def GET(self): diff = datetime.datetime.utcnow() - ultima_log.datetime if diff.days * 86400 + diff.seconds < 10: time.sleep(8) - self.SendToKindle(username, to, book4meta.title, booktype, str(oIO.getvalue()), tz) + self.SendToKindle(username, to, book4meta.title + volumeTitle, booktype, str(oIO.getvalue()), tz) rs = "%s(%s).%s Sent!"%(book4meta.title, local_time(tz=tz), booktype) main.log.info(rs) return rs else: - self.deliverlog(username, str(to), book4meta.title, 0, status='nonews',tz=tz) + self.deliverlog(username, str(to), book4meta.title + volumeTitle, 0, status='nonews', tz=tz) rs = "No new feeds." 
main.log.info(rs) return rs diff --git a/apps/__init__.py b/apps/__init__.py index 6eaac8f9..475f402f 100755 --- a/apps/__init__.py +++ b/apps/__init__.py @@ -11,7 +11,7 @@ import __builtin__, sys from google.appengine.ext import vendor -__Version__ = '1.24.2' +__Version__ = '1.25' __builtin__.__dict__['__Version__'] = __Version__ diff --git a/apps/dbModels.py b/apps/dbModels.py index 85a93e3a..fd1de303 100755 --- a/apps/dbModels.py +++ b/apps/dbModels.py @@ -13,6 +13,7 @@ from apps.utils import ke_encrypt,ke_decrypt #--------------db models---------------- +#对应到每一个”书“,注意,同一个用户的”自定义RSS“会归到同一本书内 class Book(db.Model): title = db.StringProperty(required=True) description = db.StringProperty() @@ -59,7 +60,7 @@ class KeUser(db.Model): # kindleEar User send_days = db.StringListProperty() send_time = db.IntegerProperty() timezone = db.IntegerProperty() - book_type = db.StringProperty() + book_type = db.StringProperty() #mobi,epub device = db.StringProperty() expires = db.DateTimeProperty() ownfeeds = db.ReferenceProperty(Book) # 每个用户都有自己的自定义RSS @@ -87,6 +88,8 @@ class KeUser(db.Model): # kindleEar User qrcode = db.BooleanProperty() #是否在文章末尾添加文章网址的QRCODE cover = db.BlobProperty() #保存各用户的自定义封面图片二进制内容 + book_mode = db.StringProperty() #added 2017-08-31 书籍模式,'periodical'|'comic',漫画模式可以直接全屏 + @property def whitelist(self): return WhiteList.all().filter('user = ', self.key()) @@ -95,17 +98,19 @@ def whitelist(self): def urlfilter(self): return UrlFilter.all().filter('user = ', self.key()) + #获取此账号对应的书籍的网站登陆信息 def subscription_info(self, title): - "获取此账号对应的书籍的网站登陆信息" return SubscriptionInfo.all().filter('user = ', self.key()).filter('title = ', title).get() - + +#自定义RSS订阅源 class Feed(db.Model): book = db.ReferenceProperty(Book) title = db.StringProperty() url = db.StringProperty() isfulltext = db.BooleanProperty() time = db.DateTimeProperty() #源被加入的时间,用于排序 - + +#书籍的推送历史记录 class DeliverLog(db.Model): username = db.StringProperty() to = db.StringProperty() @@ -115,10 +120,14 @@ class DeliverLog(db.Model): book = db.StringProperty() status = db.StringProperty() -class UpdateLog(db.Model): - comicname = db.StringProperty() - updatecount = db.IntegerProperty() - +#added 2017-09-01 记录已经推送的期数/章节等信息,可用来处理连载的漫画/小说等 +class LastDelivered(db.Model): + username = db.StringProperty() + bookname = db.StringProperty() + num = db.IntegerProperty(default=0) #num和record可以任选其一用来记录,或使用两个配合都可以 + record = db.StringProperty(default='') #record同时也用做在web上显示 + datetime = db.DateTimeProperty() + class WhiteList(db.Model): mail = db.StringProperty() user = db.ReferenceProperty(KeUser) diff --git a/apps/handlemail.py b/apps/handlemail.py index 994239b1..0baf8344 100644 --- a/apps/handlemail.py +++ b/apps/handlemail.py @@ -258,8 +258,8 @@ def receive(self, message): user.ownfeeds.language, local_time(tz=user.timezone), pubtype='book:book:KindleEar') oeb.container = ServerContainer(log) - id, href = oeb.manifest.generate(id='page', href='page.html') - item = oeb.manifest.add(id, href, 'application/xhtml+xml', data=unicode(soup)) + id_, href = oeb.manifest.generate(id='page', href='page.html') + item = oeb.manifest.add(id_, href, 'application/xhtml+xml', data=unicode(soup)) oeb.spine.add(item, False) oeb.toc.add(subject, href) @@ -272,8 +272,8 @@ def receive(self, message): except: pass else: - id, href = oeb.manifest.generate(id='img', href=filename) - item = oeb.manifest.add(id, href, mimetype, data=content) + id_, href = oeb.manifest.generate(id='img', href=filename) + item = oeb.manifest.add(id_, href, mimetype, data=content) oIO 
= byteStringIO() o = EPUBOutput() if user.book_type == "epub" else MOBIOutput() diff --git a/apps/utils.py b/apps/utils.py index b3db5bbc..e1ce0307 100755 --- a/apps/utils.py +++ b/apps/utils.py @@ -6,7 +6,7 @@ # cdhigh #Contributors: # rexdf - +import os, sys from functools import wraps from hashlib import md5 import web @@ -15,6 +15,20 @@ import gettext import re +#当异常出现时,使用此函数返回真实引发异常的文件名,函数名和行号 +def get_exc_location(): + #追踪到最终的异常引发点 + exc_info = sys.exc_info()[2] + last_exc = exc_info.tb_next + while (last_exc.tb_next): + last_exc = last_exc.tb_next + fileName = os.path.basename(last_exc.tb_frame.f_code.co_filename) + funcName = last_exc.tb_frame.f_code.co_name + lineNo = last_exc.tb_frame.f_lineno + last_exc = None + exc_info = None + return fileName, funcName, lineNo + def local_time(fmt="%Y-%m-%d %H:%M", tz=TIMEZONE): return (datetime.datetime.utcnow()+datetime.timedelta(hours=tz)).strftime(fmt) @@ -75,11 +89,10 @@ def wrapper(*args, **kwds): return wrapper return decorator -def InsertToc(oeb, sections, toc_thumbnails): - """ 创建OEB的两级目录,主要代码由rexdf贡献 - sections为有序字典,关键词为段名,元素为元组列表(title,brief,humbnail,content) - toc_thumbnails为字典,关键词为图片原始URL,元素为其在oeb内的href。 - """ +#创建OEB的两级目录,主要代码由rexdf贡献 +#sections为有序字典,关键词为段名,元素为元组列表(title,brief,humbnail,content) +#toc_thumbnails为字典,关键词为图片原始URL,元素为其在oeb内的href。 +def InsertToc(oeb, sections, toc_thumbnails, insertHtmlToc=True, insertThumbnail=True): css_pat = r'' css_ex = re.compile(css_pat, re.M | re.S) body_pat = r'(?<=).*?(?=)' @@ -135,20 +148,20 @@ def InsertToc(oeb, sections, toc_thumbnails): ncx_toc.append(('section', sec_with_num, href, '', sec_toc_thumbnail)) #Sections name && href && no brief #generate the secondary toc - if GENERATE_HTML_TOC: + if insertHtmlToc: html_toc_ = ['toc

%s

    ' % (sec_with_num)] for title, anchor, brief, thumbnail in secondary_toc_list: - if GENERATE_HTML_TOC: + if insertHtmlToc: html_toc_.append('    
  1. %s

  2. '%(href, anchor, title)) ncx_toc.append(('article',title, '%s#%d'%(href,anchor), brief, thumbnail)) # article name & article href && article brief - if GENERATE_HTML_TOC: + if insertHtmlToc: html_toc_.append('
') html_toc_2.append(html_toc_) name_section_list.append(sec_with_num) num_sections += 1 - if GENERATE_HTML_TOC: + if insertHtmlToc: #Generate HTML TOC for Calibre mostly ##html_toc_1 top level toc html_toc_1 = [u'Table Of Contents

%s

    '%(TABLE_OF_CONTENTS)] @@ -176,10 +189,17 @@ def InsertToc(oeb, sections, toc_thumbnails): toc = oeb.toc.add(unicode(oeb.metadata.title[0]), oeb.spine[0].href, id='periodical', klass='periodical', play_order=po) po += 1 for ncx in ncx_toc: + if insertThumbnail and ncx[4]: + toc_thumbnail = toc_thumbnails[ncx[4]] + else: + toc_thumbnail = None + if ncx[0] == 'section': - sectoc = toc.add(unicode(ncx[1]), ncx[2], klass='section', play_order=po, id='Main-section-%d'%po, toc_thumbnail=toc_thumbnails[ncx[4]] if GENERATE_TOC_THUMBNAIL and ncx[4] else None) + sectoc = toc.add(unicode(ncx[1]), ncx[2], klass='section', play_order=po, id='Main-section-%d'%po, + toc_thumbnail=toc_thumbnail) elif sectoc: - sectoc.add(unicode(ncx[1]), ncx[2], description=ncx[3] if ncx[3] else None, klass='article', play_order=po, id='article-%d'%po, toc_thumbnail=toc_thumbnails[ncx[4]] if GENERATE_TOC_THUMBNAIL and ncx[4] else None) + sectoc.add(unicode(ncx[1]), ncx[2], description=ncx[3] if ncx[3] else None, klass='article', play_order=po, + id='article-%d'%po, toc_thumbnail=toc_thumbnail) po += 1 #-----------以下几个函数为安全相关的 diff --git a/books/Readme.txt b/books/Readme.txt index 0363a6e1..fc4cec00 100644 --- a/books/Readme.txt +++ b/books/Readme.txt @@ -1,22 +1,22 @@ -1. 概述 +1. 概述 此应用根目录下的books目录存放自定义RSS设置,每个文件为一本"书",对应推送到kindle的一本书。 应用启动后会自动读取此目录下的所有py文件,动态导入,并显示在网页“我的订阅”下,可以选择是否推送。 books目录下的文件除了__init__.py和base.py,其他的文件都可以随意删除,如果你不需要的话。 在books目录下删除的“书籍”会在一天内从数据库中清除。 2. py文件格式 - ★py文件建议为UTF-8格式,特别是里面有中文的话。 + * py文件建议为UTF-8格式,特别是里面有中文的话。 所以每个py文件的头一行建议为: # -*- coding:utf-8 -*- 或者: #!/usr/bin/env python # -*- coding:utf-8 -*- - ★每个py文件都要实现一个函数getBook(),返回书籍实际定义的"类"对象: + * 每个py文件都要实现一个函数getBook(),返回书籍实际定义的"类"对象: def getBook(): return Qiushibaike - ★每本书为一个类(类名最好不要和文件名完全一样),必须实现的接口只有一个: + * 每本书为一个类(类名最好不要和文件名完全一样),必须实现的接口只有一个: Items(self, opts=None) 它是一个生成器或者返回一个迭代器。 每次返回一个元组: @@ -24,19 +24,18 @@ 图片元组:(图片MIME, URL, 图片文件名, 图片内容,None) -图片内容为字节串 其中图片MIME为:image/jpeg, image/gif 等 - ★上面已经说完了书籍定义的一切,所以如果你精通python,就可以自己写自己的书籍类了。 + * 上面已经说完了书籍定义的一切,所以如果你精通python,就可以自己写自己的书籍类了。 - ★不过如果你偷懒,也可以继承base模块中定义的两个书籍模板之一来定制自己的书籍类。 + * 不过如果你偷懒,也可以继承base模块中定义的两个书籍模板之一来定制自己的书籍类。 下一节介绍如何定制。 3. 
书籍类定制方法 写过或看过calibre的recipe的基本上就直接会了。 因为calibre的recipe模块依赖挺多的,我时间不够,偷懒了,就不移植了,直接根据 recipe的外形写了一个处理模块。 - ★根据RSS类型,从base模块中导入不同的书籍基类 - from base import BaseFeedBook/WebpageBook - 如果你感兴趣的网站不提供RSS订阅,则可以继承WebpageBook直接连接网页提取信息。 - ★子类能定制的参数都在BaseFeedBook类的定义中,注释很详细。 - ★处理HTML的BeautifulSoup为4.x版本。 - - 在此贴子里有更详细的说明:http://www.hi-pda.com/forum/viewthread.php?tid=1248204 + * 根据RSS类型,从base模块中导入不同的书籍基类 + from base import BaseFeedBook/WebpageBook/BaseComicBook + * 如果你感兴趣的网站不提供RSS订阅,则可以继承WebpageBook直接连接网页提取信息。 + * 子类能定制的参数都在BaseFeedBook类的定义中,注释很详细。 + * 处理HTML的BeautifulSoup为4.x版本。 + * cartoonmadbase.py提供了抓取漫画图片的例子。 diff --git a/books/Xiaodaonews.py b/books/Xiaodaonews.py deleted file mode 100644 index b519e799..00000000 --- a/books/Xiaodaonews.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -from weixinbase import WeixinBook - -def getBook(): - return Xiaodaonews - -class Xiaodaonews(WeixinBook): - title = u'微信公众号:小道消息' - description = u'只有小道消息才能拯救中国互联网' - language = 'zh-cn' - feed_encoding = "utf-8" - page_encoding = "utf-8" - oldest_article = 7 - deliver_days = ['Friday'] - feeds = [ - (u'小道消息', 'http://weixin.sogou.com/gzh?openid=oIWsFt86NKeSGd_BQKp1GcDkYpv0'), - ] diff --git a/books/__init__.py b/books/__init__.py index fdb12449..9e1a7a15 100644 --- a/books/__init__.py +++ b/books/__init__.py @@ -1,6 +1,9 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- - +#KindleEar +#Author: cdhigh +#自动加载books目录和子目录下的所有书籍文件,所有的自定义基类(不是最终的书籍实现)请以base.py结尾,比如xxxxbase.py +#各子目录下必须要有一个__init__.py文件,否则不会导入对应子目录下的书籍 import os _booksclasses = [] @@ -17,16 +20,21 @@ def BookClass(title): return bk return None -#def LoadBooks(): -for bkfile in os.listdir(os.path.dirname(__file__)): - if bkfile.endswith('.py') and not bkfile.startswith('__') and not bkfile.endswith("base.py"): - bookname = os.path.splitext(bkfile)[0] - try: - mbook = __import__("books." + bookname, fromlist='*') - bk = mbook.getBook() - #globals()[bk.__name__] = getattr(bk, bk.__name__) - RegisterBook(bk) - except Exception as e: - default_log.warn("Book '%s' import failed : %s" % (bookname,e)) - -#LoadBooks() +bookRootDir = os.path.dirname(__file__) +listBkDirs = os.walk(bookRootDir) +for root, dirs, files in listBkDirs: + for f in files: + bkFile = os.path.join(root, f) + baseName = os.path.basename(bkFile) + initFileName = os.path.join(os.path.dirname(bkFile), '__init__.py') #保证对应子目录下有__init__.py + if bkFile.endswith('.py') and not baseName.startswith('__') and not bkFile.endswith("base.py") and os.path.isfile(initFileName): + fullName = bkFile.replace(bookRootDir, '') + fullName = fullName.lstrip('/').lstrip('\\').replace('\\', '/') + bookModuleName = os.path.splitext(fullName)[0].replace('/', '.') + try: + mBook = __import__('books.' 
+ bookModuleName, fromlist='*') + if hasattr(mBook, 'getBook'): + bk = mBook.getBook() + RegisterBook(bk) + except Exception as e: + default_log.warn("Book '%s' import failed : %s" % (bookModuleName, e)) diff --git a/books/base.py b/books/base.py index ab0dc37f..0092a85e 100644 --- a/books/base.py +++ b/books/base.py @@ -19,7 +19,7 @@ from StringIO import StringIO from config import * -from apps.dbModels import UpdateLog + #base class of Book class BaseFeedBook: title = '' @@ -168,20 +168,23 @@ def postprocess(self, content): #------------------------------------------------------------ # 下面的内容为类实现细节 #------------------------------------------------------------ - def __init__(self, log=None, imgindex=0): + def __init__(self, log=None, imgindex=0, opts=None, user=None): self.log = default_log if log is None else log self.compiled_urlfilters = [] self._imgindex = imgindex - + self.opts = opts + self.user = user + self.last_delivered_volume = '' #如果需要在推送书籍的标题中提供当前期号之类的信息,可以使用此属性 + @property def timeout(self): return self.network_timeout if self.network_timeout else CONNECTION_TIMEOUT - + @property def imgindex(self): self._imgindex += 1 return self._imgindex - + def isfiltered(self, url): if not self.url_filters: return False @@ -194,11 +197,19 @@ def isfiltered(self, url): if flt.match(url): return True return False - + + #返回当前任务的用户名 + def UserName(self): + return self.user.name if self.user else 'admin' + + #返回最近推送到期号(如果信息可用的话) + def LastDeliveredVolume(self): + return self.last_delivered_volume + @classmethod def urljoin(self, base, url): #urlparse.urljoin()处理有..的链接有点问题,此函数修正此问题。 - join = urlparse.urljoin(base,url) + join = urlparse.urljoin(base, url) url = urlparse.urlsplit(join) path = os.path.normpath(url.path) if IsRunInLocal: #假定调试环境为windows @@ -253,14 +264,9 @@ def ParseFeedUrls(self): result = opener.open(url) if result.status_code == 200 and result.content: #debug_mail(result.content, 'feed.xml') + decoder = AutoDecoder(isfeed=True) + content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers) - if self.feed_encoding: - try: - content = result.content.decode(self.feed_encoding) - except UnicodeDecodeError: - content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) - else: - content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) feed = feedparser.parse(content) for e in feed['entries'][:self.max_articles_per_feed]: @@ -316,14 +322,18 @@ def ParseFeedUrls(self): continue else: self.log.warn('Fulltext feed item no has desc,link to webpage for article.(%s)' % title) - urls.append((section, title, urlfeed, desc)) + urladded.add(urlfeed) + #针对URL里面有unicode字符的处理,否则会出现Bad request + #后面参数里面的那一堆“乱码”是要求不处理ASCII的特殊符号,只处理非ASCII字符 + urlfeed = urllib.quote_plus(urlfeed.encode('utf-8'), r'''~`!@#$%^&*()|\\/,.<>;:"'{}[]?=-_+''') + urls.append((section, title, urlfeed, desc)) else: self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url)) return urls - def Items(self, opts=None, user=None): + def Items(self): """ 生成器,返回一个元组 对于HTML:section,url,title,content,brief,thumbnail @@ -333,8 +343,8 @@ def Items(self, opts=None, user=None): readability = self.readability if self.fulltext_by_readability else self.readability_by_soup prevsection = '' opener = URLOpener(self.host, timeout=self.timeout, headers=self.extra_header) - decoder = AutoDecoder(False) - for section, ftitle, url, desc in urls: + decoder = AutoDecoder(isfeed=False) + for section, fTitle, url, desc in urls: if not 
desc: #非全文RSS if section != prevsection or prevsection == '': decoder.encoding = '' #每个小节都重新检测编码 @@ -351,17 +361,17 @@ def Items(self, opts=None, user=None): if not article: continue else: - article = self.FragToXhtml(desc, ftitle) + article = self.FragToXhtml(desc, fTitle) #如果是图片,title则是mime - for title, imgurl, imgfn, content, brief, thumbnail in readability(article,url,opts,user): + for title, imgurl, imgfn, content, brief, thumbnail in readability(article, url): if title.startswith(r'image/'): #图片 yield (title, imgurl, imgfn, content, brief, thumbnail) else: - if user and user.use_title_in_feed: - title = ftitle + if self.user and self.user.use_title_in_feed: + title = fTitle elif not title: - title = ftitle + title = fTitle content = self.postprocess(content) yield (section, url, title, content, brief, thumbnail) @@ -480,24 +490,33 @@ def fetch(self, url, opener, decoder): """链接网络,下载网页并解码""" result = opener.open(url) status_code, content = result.status_code, result.content - if status_code not in (200,206) or not content: + if status_code not in (200, 206) or not content: self.log.warn('fetch page failed(%s):%s.' % (URLOpener.CodeMap(status_code), url)) return None #debug_mail(content) + return self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers) - if self.page_encoding: + #自动解码,返回解码后的网页 + #content: 要解码的网页 + #decoder: AutoDecoder实例 + #defaultEncoding: 默认的编码 + #url: 网页的原始url地址(注意可能和之前opener使用的url不同,因为有可能发生了重定向,所以建议使用opener.realurl属性) + #headers: 网页返回的http响应头 + def AutoDecodeContent(self, content, decoder, defaultEncoding=None, url=None, headers=None): + if defaultEncoding: try: - return content.decode(self.page_encoding) + return content.decode(defaultEncoding) except UnicodeDecodeError: - return decoder.decode(content,opener.realurl,result.headers) + return decoder.decode(content, url, headers) else: - return decoder.decode(content,opener.realurl,result.headers) - - def readability(self, article, url, opts=None, user=None): + return decoder.decode(content, url, headers) + + def readability(self, article, url): """ 使用readability-lxml处理全文信息 因为图片文件占内存,为了节省内存,这个函数也做为生成器 """ + user = self.user content = self.preprocess(article) if not content: return @@ -513,8 +532,8 @@ def readability(self, article, url, opts=None, user=None): imgmime = r"image/" + imgtype fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype) yield (imgmime, url, fnimg, content, None, None) - tmphtml = 'Picture' % fnimg - yield ('Picture', None, None, tmphtml, '', None) + tmpHtml = 'Picture' % fnimg + yield ('Picture', None, None, tmpHtml, '', None) else: self.log.warn('article is invalid.[%s]' % url) return @@ -576,8 +595,8 @@ def readability(self, article, url, opts=None, user=None): if self.remove_tags: for tag in soup.find_all(self.remove_tags): tag.decompose() - for id in self.remove_ids: - for tag in soup.find_all(attrs={"id":id}): + for id_ in self.remove_ids: + for tag in soup.find_all(attrs={"id":id_}): tag.decompose() for cls in self.remove_classes: for tag in soup.find_all(attrs={"class":cls}): @@ -604,31 +623,15 @@ def readability(self, article, url, opts=None, user=None): thumbnail = None if self.keep_image: + self.RectifyImageSrcInSoup(soup, url) opener = URLOpener(self.host, timeout=self.timeout, headers=self.extra_header) for img in soup.find_all('img'): - #现在使用延迟加载图片技术的网站越来越多了,这里处理一下 - #注意:如果data-src之类的属性保存的不是真实url就没辙了 - imgurl = img['src'] if 'src' in img.attrs else '' + imgurl = img['src'] if 'src' in img.attrs else None if not imgurl: - for 
attr in img.attrs: - if attr != 'src' and 'src' in attr: #很多网站使用data-src - imgurl = img[attr] - break - if not imgurl: - img.decompose() continue - if not imgurl.startswith('data:'): - if not imgurl.startswith('http'): - imgurl = self.urljoin(url, imgurl) - if self.fetch_img_via_ssl and url.startswith('https://'): - imgurl = imgurl.replace('http://', 'https://') - if self.isfiltered(imgurl): - self.log.warn('img filtered : %s' % imgurl) - img.decompose() - continue - + imgresult = opener.open(imgurl) - imgcontent = self.process_image(imgresult.content, opts) if imgresult.status_code == 200 else None + imgcontent = self.process_image(imgresult.content) if imgresult.status_code == 200 else None if imgcontent: if isinstance(imgcontent, list): #一个图片分隔为多个图片 imgIndex = self.imgindex @@ -692,16 +695,15 @@ def readability(self, article, url, opts=None, user=None): self.soupprocessex(soup) #插入分享链接,如果有插入qrcode,则返回(imgName, imgContent) - if user: - qrimg = self.AppendShareLinksToArticle(soup, user, url) - if qrimg: - yield ('image/jpeg', url, qrimg[0], qrimg[1], None, None) + qrimg = self.AppendShareLinksToArticle(soup, url) + if qrimg: + yield ('image/jpeg', url, qrimg[0], qrimg[1], None, None) content = unicode(soup) - #提取文章内容的前面一部分做为摘要 + #提取文章内容的前面一部分做为摘要,[漫画模式不需要摘要] brief = u'' - if GENERATE_TOC_DESC: + if GENERATE_TOC_DESC and ((not user) or user.book_mode != 'comic'): for h in body.find_all(['h1','h2']): # 去掉h1/h2,避免和标题重复 h.decompose() for s in body.stripped_strings: @@ -713,10 +715,11 @@ def readability(self, article, url, opts=None, user=None): yield (title, None, None, content, brief, thumbnail) - def readability_by_soup(self, article, url, opts=None, user=None): + def readability_by_soup(self, article, url): """ 使用BeautifulSoup手动解析网页,提取正文内容 因为图片文件占内存,为了节省内存,这个函数也做为生成器 """ + user = self.user content = self.preprocess(article) soup = BeautifulSoup(content, "lxml") @@ -761,8 +764,8 @@ def readability_by_soup(self, article, url, opts=None, user=None): for tag in soup.find_all(remove_tags): tag.decompose() - for id in remove_ids: - for tag in soup.find_all(attrs={"id":id}): + for id_ in remove_ids: + for tag in soup.find_all(attrs={"id":id_}): tag.decompose() for cls in remove_classes: for tag in soup.find_all(attrs={"class":cls}): @@ -784,31 +787,15 @@ def readability_by_soup(self, article, url, opts=None, user=None): thumbnail = None if self.keep_image: + self.RectifyImageSrcInSoup(soup, url) opener = URLOpener(self.host, timeout=self.timeout, headers=self.extra_header) for img in soup.find_all('img'): - #现在使用延迟加载图片技术的网站越来越多了,这里处理一下 - #注意:如果data-src之类的属性保存的不是真实url就没辙了 - imgurl = img['src'] if 'src' in img.attrs else '' - if not imgurl: - for attr in img.attrs: - if attr != 'src' and 'src' in attr: #很多网站使用data-src - imgurl = img[attr] - break + imgurl = img['src'] if 'src' in img.attrs else None if not imgurl: - img.decompose() continue - if not imgurl.startswith('data:'): - if not imgurl.startswith('http'): - imgurl = self.urljoin(url, imgurl) - if self.fetch_img_via_ssl and url.startswith('https://'): - imgurl = imgurl.replace('http://', 'https://') - if self.isfiltered(imgurl): - self.log.warn('img filtered:%s' % imgurl) - img.decompose() - continue imgresult = opener.open(imgurl) - imgcontent = self.process_image(imgresult.content, opts) if imgresult.status_code == 200 else None + imgcontent = self.process_image(imgresult.content) if imgresult.status_code == 200 else None if imgcontent: if isinstance(imgcontent, list): #一个图片分隔为多个图片 imgIndex = self.imgindex @@ -894,16 +881,15 @@ def 
readability_by_soup(self, article, url, opts=None, user=None): self.soupprocessex(soup) #插入分享链接,如果插入了qrcode,则返回(imgName, imgContent) - if user: - qrimg = self.AppendShareLinksToArticle(soup, user, url) - if qrimg: - yield ('image/jpeg', url, qrimg[0], qrimg[1], None, None) + qrimg = self.AppendShareLinksToArticle(soup, url) + if qrimg: + yield ('image/jpeg', url, qrimg[0], qrimg[1], None, None) content = unicode(soup) - #提取文章内容的前面一部分做为摘要 + #提取文章内容的前面一部分做为摘要,[漫画模式不需要摘要] brief = u'' - if GENERATE_TOC_DESC: + if GENERATE_TOC_DESC and ((not user) or user.book_mode != 'comic'): for h in body.find_all(['h1','h2']): # 去掉h1/h2,避免和标题重复 h.decompose() for s in body.stripped_strings: @@ -915,11 +901,47 @@ def readability_by_soup(self, article, url, opts=None, user=None): yield (title, None, None, content, brief, thumbnail) + #如果需要,纠正或规则化soup里面的图片地址,比如延迟加载等 + def RectifyImageSrcInSoup(self, soup, url=None): + for img in soup.find_all('img'): + #现在使用延迟加载图片技术的网站越来越多了,这里处理一下 + #注意:如果data-src|data-original之类的属性保存的不是真实url就没辙了 + imgUrl = img['src'] if 'src' in img.attrs else '' + if not imgUrl: + for attr in img.attrs: + if attr != 'src' and (('src' in attr) or (attr == 'data-original')): #很多网站使用data-src|data-original + imgUrl = img[attr] + break + if not imgUrl: + for attr in img.attrs: + if attr != 'src' and 'data' in attr: #如果上面的搜索找不到,再大胆一点猜测url + imgUrl = img[attr] + break + + if not imgUrl: + img.decompose() + continue + + if url and not imgUrl.startswith(('data:', 'http')): + imgUrl = self.urljoin(url, imgUrl) + + if url and self.fetch_img_via_ssl and url.startswith('https://'): + imgUrl = imgUrl.replace('http://', 'https://') + + if self.isfiltered(imgUrl): + self.log.warn('img filtered : %s' % imgUrl) + img.decompose() + continue + + img['src'] = imgUrl #将更正的地址写回保存 + + #根据一些配置,对图像进行处理,比如缩小,转灰度图,转格式,图像分隔等 - def process_image(self, data, opts): + def process_image(self, data): if not data: return - + + opts = self.opts try: if not opts or not opts.process_images or not opts.process_images_immediately: return data @@ -927,7 +949,7 @@ def process_image(self, data, opts): return mobify_image(data) else: #如果图被拆分,则返回一个图像列表,否则返回None - splitedImages = self.SplitLongImage(data, opts) if THRESHOLD_SPLIT_LONG_IMAGE else None + splitedImages = self.SplitLongImage(data) if THRESHOLD_SPLIT_LONG_IMAGE else None if splitedImages: images = [] for image in splitedImages: @@ -943,7 +965,7 @@ def process_image(self, data, opts): return data #如果一个图片太长,则将其分隔成多个图片 - def SplitLongImage(self, data, opts): + def SplitLongImage(self, data): if not THRESHOLD_SPLIT_LONG_IMAGE: return None @@ -955,7 +977,7 @@ def SplitLongImage(self, data, opts): #info = img.info #高比宽至少大一倍才认为是长图 - if height < THRESHOLD_SPLIT_LONG_IMAGE or height < width * 2: + if height < THRESHOLD_SPLIT_LONG_IMAGE or height < width * 3: return None imagesData = [] @@ -977,9 +999,11 @@ def SplitLongImage(self, data, opts): return imagesData #在文章末尾添加分享链接,如果文章末尾添加了网址的QRCODE,则此函数返回生成的图像(imgName, imgContent),否则返回None - def AppendShareLinksToArticle(self, soup, user, url): + def AppendShareLinksToArticle(self, soup, url): + user = self.user if not user or not soup: return None + FirstLink = True qrimg = None qrimgName = '' @@ -1074,9 +1098,9 @@ def AppendShareLinksToArticle(self, soup, user, url): img.save(qrimg, 'JPEG') return (qrimgName, qrimg.getvalue()) if qrimg else None - + + #生成保存内容或分享文章链接的KindleEar调用链接 def MakeShareLink(self, sharetype, user, url, soup): - " 生成保存内容或分享文章链接的KindleEar调用链接 " if sharetype in ('evernote', 'wiz'): href = "%s/share?act=%s&u=%s&url=" % 
(DOMAIN, sharetype, user.name) elif sharetype == 'pocket': @@ -1112,14 +1136,14 @@ class WebpageBook(BaseFeedBook): fulltext_by_readability = False # 直接在网页中获取信息 - def Items(self, opts=None, user=None): + def Items(self): """ 生成器,返回一个元组 对于HTML:section,url,title,content,brief,thumbnail 对于图片,mime,url,filename,content,brief,thumbnail 如果是图片,仅第一个图片的thumbnail返回True,其余为None """ - decoder = AutoDecoder(False) + decoder = AutoDecoder(isfeed=False) timeout = self.timeout for section, url in self.feeds: opener = URLOpener(self.host, timeout=timeout, headers=self.extra_header) @@ -1129,14 +1153,8 @@ def Items(self, opts=None, user=None): self.log.warn('fetch article failed(%s):%s.' % (URLOpener.CodeMap(status_code), url)) continue - if self.page_encoding: - try: - content = content.decode(self.page_encoding) - except UnicodeDecodeError: - content = decoder.decode(content,opener.realurl,result.headers) - else: - content = decoder.decode(content,opener.realurl,result.headers) - + content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers) + content = self.preprocess(content) soup = BeautifulSoup(content, "lxml") @@ -1186,8 +1204,8 @@ def Items(self, opts=None, user=None): remove_attrs = self.insta_remove_attrs + self.remove_attrs for tag in soup.find_all(remove_tags): tag.decompose() - for id in remove_ids: - for tag in soup.find_all(attrs={"id":id}): + for id_ in remove_ids: + for tag in soup.find_all(attrs={"id":id_}): tag.decompose() for cls in remove_classes: for tag in soup.find_all(attrs={"class":cls}): @@ -1213,29 +1231,14 @@ def Items(self, opts=None, user=None): thumbnail = None if self.keep_image: self.soupbeforeimage(soup) + self.RectifyImageSrcInSoup(soup, url) for img in soup.find_all('img'): - #现在使用延迟加载图片技术的网站越来越多了,这里处理一下 - #注意:如果data-src之类的属性保存的不是真实url就没辙了 - imgurl = img['src'] if 'src' in img.attrs else '' - if not imgurl: - for attr in img.attrs: - if attr != 'src' and 'src' in attr: #很多网站使用data-src - imgurl = img[attr] - break + imgurl = img['src'] if 'src' in img.attrs else None if not imgurl: - img.decompose() continue - if not imgurl.startswith('data:'): - if not imgurl.startswith('http'): - imgurl = self.urljoin(url, imgurl) - if self.fetch_img_via_ssl and url.startswith('https://'): - imgurl = imgurl.replace('http://', 'https://') - if self.isfiltered(imgurl): - self.log.warn('img filtered:%s' % imgurl) - img.decompose() - continue + imgresult = opener.open(imgurl) - imgcontent = self.process_image(imgresult.content,opts) if imgresult.status_code==200 else None + imgcontent = self.process_image(imgresult.content) if imgresult.status_code==200 else None if imgcontent: if isinstance(imgcontent, list): #一个图片分隔为多个图片 imgIndex = self.imgindex @@ -1295,9 +1298,9 @@ def Items(self, opts=None, user=None): self.soupprocessex(soup) content = unicode(soup) - #提取文章内容的前面一部分做为摘要 + #提取文章内容的前面一部分做为摘要,[漫画模式不需要摘要] brief = u'' - if GENERATE_TOC_DESC: + if GENERATE_TOC_DESC and ((not self.user) or self.user.book_mode != 'comic'): for h in body.find_all(['h1','h2']): # 去掉h1/h2,避免和标题重复 h.decompose() for s in body.stripped_strings: @@ -1322,7 +1325,9 @@ def ParseFeedUrls(self): return [(sec,sec,url,'') for sec, url in self.feeds] class BaseComicBook(BaseFeedBook): - """ 漫画专用 + """ 漫画专用,漫画的主要特征是全部为图片,而且图片默认全屏呈现 + 由 insert0003 贡献代码 + 如果要处理连载的话,可以使用 ComicUpdateLog 数据库表来记录和更新 """ title = u'' description = u'' @@ -1331,117 +1336,123 @@ class BaseComicBook(BaseFeedBook): page_encoding = '' mastheadfile = '' coverfile = '' - mainurl = '' - - def Items(self, opts=None, 
user=None): - """ - 生成器,返回一个图片元组,mime,url,filename,content,brief,thumbnail - """ + feeds = [] #子类填充此列表[('name', mainurl),...] + min_image_size = (150, 150) #小于这个尺寸的图片会被删除,用于去除广告图片或按钮图片之类的 + + #子类必须实现此函数,返回 [(section, title, url, desc),..] + #每个URL直接为图片地址,或包含一个或几个漫画图片的网页地址 + def ParseFeedUrls(self): + return [] + + #生成器,返回一个图片元组,mime,url,filename,content,brief,thumbnail + def Items(self): urls = self.ParseFeedUrls() opener = URLOpener(self.host, timeout=self.timeout, headers=self.extra_header) - imgs = [] - for section, ftitle, url, desc in urls: - opener = URLOpener(self.host, timeout=self.timeout, headers=self.extra_header) + decoder = AutoDecoder(isfeed=False) + prevSection = '' + min_width, min_height = self.min_image_size if self.min_image_size else (0, 0) + htmlTemplate = '%s' + + for section, fTitle, url, desc in urls: + if section != prevSection or prevSection == '': + decoder.encoding = '' #每个小节都重新检测编码[当然是在抓取的是网页的情况下才需要] + prevSection = section + opener = URLOpener(self.host, timeout=self.timeout, headers=self.extra_header) + if self.needs_subscription: + result = self.login(opener, decoder) + result = opener.open(url) - article = result.content - if not article: + content = result.content + if not content: continue - - imgtype = imghdr.what(None, article) - imgmime = r"image/" + imgtype - fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype) - imgs.append(fnimg) - yield (imgmime, url, fnimg, article, None, None) - - if len(imgs)> 0: - tmphtml = 'Picture' - yield (self.title, url, ftitle, tmphtml, '', None) - - def updatelog(self, name, count): - try: - mylogs = UpdateLog.all().filter("comicname = ", name) - for log in mylogs: - log.delete() - dl = UpdateLog(comicname=name, updatecount=count) - dl.put() - except Exception as e: - default_log.warn("Updatelog failed to save: %s" % str(e)) - return None - - def GetNewComic(self): - href = "" - - if (self.title == "") or (self.mainurl == "") : - return href - - mhlog = UpdateLog.all().filter("comicname = ", self.title).get() - if mhlog is None: - default_log.warn("These is no log in db, set to 1") - oldNum = 1 - else: - oldNum = mhlog.updatecount - - opener = URLOpener(self.host, timeout=60) - result = opener.open(self.mainurl) - if result.status_code != 200: - self.log.warn('fetch rss failed:%s' % self.mainurl) - return href - - content = result.content.decode(self.feed_encoding, 'ignore') - soup = BeautifulSoup(content, "lxml") - - mhs = soup.findAll("table", {"width": '688'}) - for mh in mhs: - comics = mh.findAll("a", {"target": '_blank'}) - for comic in comics: - num = int(comic.text.split(" ")[1]) - if num > oldNum : - oldNum = num - href = "http://www.cartoonmad.com" + comic.get("href") - - if href != "" : - self.updatelog(self.title, oldNum) - - return href - - def GetComicUrls(self, href): - urls = [] - - comic_opener = URLOpener(self.host, timeout=60) - comic_page = comic_opener.open(href) - if comic_page.status_code != 200: - self.log.warn('fetch rss failed:%s' % href) - return [] - - comic_content = comic_page.content.decode(self.feed_encoding, 'ignore') - comic_body = BeautifulSoup(comic_content, "lxml") - ul = comic_body.find("select").findAll("option") - if ul is None : - return[] - else: - for mh in ul: - mhhref = mh.get("value") - if mhhref: - pagehref = "http://www.cartoonmad.com/comic/" + mhhref - pageopener = URLOpener(self.host, timeout=60) - pageresult = pageopener.open(pagehref) - if pageresult.status_code != 200: - self.log.warn('fetch rss failed:%s' % pagehref) - return [] - body = 
pageresult.content.decode(self.feed_encoding, 'ignore') - sp = BeautifulSoup(body, "lxml") - mhpic = sp.find("img", {"oncontextmenu": 'return false'}).get("src") - urls.append( (self.title, mh.text, mhpic, None)) - return urls - - def ParseFeedUrls(self): - href = self.GetNewComic() - if href == "": - return [] - - return self.GetComicUrls(href) - - + + imgFilenameList = [] + + #先判断是否是图片 + imgType = imghdr.what(None, content) + if imgType: + imgMime = r"image/" + imgType + fnImg = "img%d.%s" % (self.imgindex, 'jpg' if imgType=='jpeg' else imgType) + imgFilenameList.append(fnImg) + yield (imgMime, url, fnImg, content, None, None) + else: #不是图片,有可能是包含图片的网页,抽取里面的图片 + content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers) + soup = BeautifulSoup(content, 'lxml') + self.RectifyImageSrcInSoup(soup, opener.realurl) + + #有可能一个网页有多个漫画图片,而且还有干扰项(各种按钮/广告等),所以先全部保存再判断好了 + #列表格式[(url, content),...] + imgContentList = [] + for img in soup.find_all('img'): + imgUrl = img['src'] if 'src' in img.attrs else None + if not imgUrl: + continue + + #为了省时间,如果图片属性中有width/height,则也可以先初步判断是不是漫画图片 + if 'width' in img.attrs: + width = img.attrs['width'].replace('"', '').replace("'", '').replace('px', '').strip() + try: + if int(width) < min_width: + continue + except: + pass + + if 'height' in img.attrs: + height = img.attrs['height'].replace('"', '').replace("'", '').replace('px', '').strip() + try: + if int(height) < min_height: + continue + except: + pass + + imgResult = opener.open(imgUrl) + if imgResult.status_code == 200 and imgResult.content: + imgContentList.append((imgUrl, imgResult.content)) + + #判断图片里面哪些是真正的漫画图片 + if not imgContentList: + continue + elif len(imgContentList) == 1: + imgUrl, imgContent = imgContentList[0] + imgType = imghdr.what(None, imgContent) + if imgType: + imgMime = r"image/" + imgType + fnImg = "img%d.%s" % (self.imgindex, 'jpg' if imgType=='jpeg' else imgType) + imgFilenameList.append(fnImg) + yield (imgMime, imgUrl, fnImg, imgContent, None, None) + else: #多个图片,要分析哪些才是漫画 + isComics = [True for n in range(len(imgContentList))] + for idx, imgItem in enumerate(imgContentList): + imgUrl, imgContent = imgItem + imgInstance = Image.open(StringIO(imgContent)) + width, height = imgInstance.size + #图片太小则排除 + if width < min_width or height < min_height: + isComics[idx] = False + elif width > height * 4: #一般横幅广告图片都是横长条,可以剔除 + isComics[idx] = False + + #如果所有的图片都被排除了,则使用所有图片里面尺寸最大的 + if not any(isComics): + imgContentList.sort(key=lambda x: len(x[1]), reverse=True) + imgContentList = [imgContentList[0]] + else: + imgContentList = [item for idx, item in enumerate(imgContentList) if isComics[idx]] + + #列表中的就是漫画图片 + for imgUrl, imgContent in imgContentList: + imgType = imghdr.what(None, imgContent) + if imgType: + imgMime = r"image/" + imgType + fnImg = "img%d.%s" % (self.imgindex, 'jpg' if imgType=='jpeg' else imgType) + imgFilenameList.append(fnImg) + yield (imgMime, imgUrl, fnImg, imgContent, None, None) + + #每个图片当做一篇文章,否则全屏模式下图片会挤到同一页 + for imgFilename in imgFilenameList: + tmpHtml = htmlTemplate % (fTitle, imgFilename) + yield (imgFilename.split('.')[0], url, fTitle, tmpHtml, '', None) + #几个小工具函数 def remove_beyond(tag, next): while tag is not None and getattr(tag, 'name', None) != 'body': diff --git a/books/Lagoon.py b/books/comic/Lagoon.py similarity index 62% rename from books/Lagoon.py rename to books/comic/Lagoon.py index 2d6efe79..d5a70d61 100644 --- a/books/Lagoon.py +++ b/books/comic/Lagoon.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- 
coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return Lagoon -class Lagoon(BaseComicBook): +class Lagoon(CartoonMadBaseBook): title = u'LetsLagoon' description = u'日本漫画家创作的漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class Lagoon(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_lagoon.jpg' - mainurl = 'http://www.cartoonmad.com/comic/1473.html' + feeds = [(u'LetsLagoon', 'http://www.cartoonmad.com/comic/1473.html')] diff --git a/books/comic/__init__.py b/books/comic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/books/comic/cartoonmadbase.py b/books/comic/cartoonmadbase.py new file mode 100644 index 00000000..05ae7476 --- /dev/null +++ b/books/comic/cartoonmadbase.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +#http://www.cartoonmad.com网站的漫画的基类,简单提供几个信息实现一个子类即可推送特定的漫画 +import datetime +from bs4 import BeautifulSoup +from config import TIMEZONE +from lib.urlopener import URLOpener +from lib.autodecoder import AutoDecoder +from books.base import BaseComicBook +from apps.dbModels import LastDelivered + +class CartoonMadBaseBook(BaseComicBook): + title = u'' + description = u'' + language = '' + feed_encoding = '' + page_encoding = '' + mastheadfile = '' + coverfile = '' + host = 'http://www.cartoonmad.com' + feeds = [] #子类填充此列表[('name', mainurl),...] + + #使用此函数返回漫画图片列表[(section, title, url, desc),...] + def ParseFeedUrls(self): + urls = [] #用于返回 + + newComicUrls = self.GetNewComic() #返回[(title, num, url),...] + if not newComicUrls: + return [] + + decoder = AutoDecoder(isfeed=False) + for title, num, url in newComicUrls: + opener = URLOpener(self.host, timeout=60) + result = opener.open(url) + if result.status_code != 200 or not result.content: + self.log.warn('fetch comic page failed: %s' % url) + continue + + content = result.content + content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers) + + bodySoup = BeautifulSoup(content, 'lxml') + sel = bodySoup.find('select') #页码行,要提取所有的页面 + ul = sel.find_all('option') if sel else None + if not ul: + continue + + for comicPage in ul: + href = comicPage.get('value') + if href: + pageHref = self.urljoin(url, href) + result = opener.open(pageHref) + if result.status_code != 200: + self.log.warn('fetch comic page failed: %s' % pageHref) + continue + + content = result.content + content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers) + soup = BeautifulSoup(content, 'lxml') + + comicImgTag = soup.find('img', {'oncontextmenu': 'return false'}) + comicSrc = comicImgTag.get('src') if comicImgTag else None + if comicSrc: + urls.append((title, comicPage.text, comicSrc, None)) + + self.UpdateLastDelivered(title, num) + + return urls + + #更新已经推送的卷序号到数据库 + def UpdateLastDelivered(self, title, num): + userName = self.UserName() + dbItem = LastDelivered.all().filter('username = ', userName).filter('bookname = ', title).get() + self.last_delivered_volume = u' 第%d卷' % num + if dbItem: + dbItem.num = num + dbItem.record = self.last_delivered_volume + dbItem.datetime = datetime.datetime.utcnow() + datetime.timedelta(hours=TIMEZONE) + else: + dbItem = LastDelivered(username=userName, bookname=title, num=num, record=self.last_delivered_volume, + datetime=datetime.datetime.utcnow() + datetime.timedelta(hours=TIMEZONE)) + dbItem.put() + + #根据已经保存的记录查看连载是否有新的章节,返回章节URL列表 + #返回:[(title, num, url),...] 
+ def GetNewComic(self): + urls = [] + + if not self.feeds: + return [] + + userName = self.UserName() + decoder = AutoDecoder(isfeed=False) + for item in self.feeds: + title, url = item[0], item[1] + + lastCount = LastDelivered.all().filter('username = ', userName).filter("bookname = ", title).get() + if not lastCount: + self.log.info('There is no log in db LastDelivered for name: %s, set to 0' % title) + oldNum = 0 + else: + oldNum = lastCount.num + + opener = URLOpener(self.host, timeout=60) + result = opener.open(url) + if result.status_code != 200: + self.log.warn('fetch index page for %s failed[%s] : %s' % (title, URLOpener.CodeMap(result.status_code), url)) + continue + content = result.content + content = self.AutoDecodeContent(content, decoder, self.feed_encoding, opener.realurl, result.headers) + + soup = BeautifulSoup(content, 'lxml') + + allComicTable = soup.find_all('table', {'width': '688'}) + addedForThisComic = False + for comicTable in allComicTable: + comicVolumes = comicTable.find_all('a', {'target': '_blank'}) + for volume in comicVolumes: + texts = volume.text.split(' ') + if len(texts) > 2 and texts[1].isdigit() and volume.get('href'): + num = int(texts[1]) + if num > oldNum: + oldNum = num + href = self.urljoin(self.host, volume.get('href')) + urls.append((title, num, href)) + addedForThisComic = True + break #一次只推送一卷(有时候一卷已经很多图片了) + + if addedForThisComic: + break + + return urls + \ No newline at end of file diff --git a/books/conan.py b/books/comic/conan.py similarity index 62% rename from books/conan.py rename to books/comic/conan.py index 6fcc3ef1..01995cab 100644 --- a/books/conan.py +++ b/books/comic/conan.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return Conan -class Conan(BaseComicBook): +class Conan(CartoonMadBaseBook): title = u'名侦探柯南' description = u'日本漫画家青山刚昌创作的侦探漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class Conan(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_conan.jpg' - mainurl = 'http://www.cartoonmad.com/comic/1066.html' + feeds = [(u'名侦探柯南', 'http://www.cartoonmad.com/comic/1066.html')] diff --git a/books/fairytail.py b/books/comic/fairytail.py similarity index 62% rename from books/fairytail.py rename to books/comic/fairytail.py index 369968f9..2648b906 100644 --- a/books/fairytail.py +++ b/books/comic/fairytail.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return FairyTail -class FairyTail(BaseComicBook): +class FairyTail(CartoonMadBaseBook): title = u'妖精的尾巴' description = u'日本漫画家真岛浩创作的少年漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class FairyTail(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_fairytail.jpg' - mainurl = 'http://www.cartoonmad.com/comic/1153.html' + feeds = [(u'妖精的尾巴', 'http://www.cartoonmad.com/comic/1153.html')] diff --git a/books/giant.py b/books/comic/giant.py similarity index 60% rename from books/giant.py rename to books/comic/giant.py index 8973a24b..4cc919e3 100644 --- a/books/giant.py +++ b/books/comic/giant.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return Giant -class Giant(BaseComicBook): +class Giant(CartoonMadBaseBook): title = u'进击的巨人' 
description = u'諫山創' language = 'zh-tw' @@ -13,4 +14,4 @@ class Giant(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_giant.jpg' - mainurl = 'http://www.cartoonmad.com/comic/1221.html' + feeds = [(u'进击的巨人', 'http://www.cartoonmad.com/comic/1221.html')] diff --git a/books/giant_before.py b/books/comic/giant_before.py similarity index 59% rename from books/giant_before.py rename to books/comic/giant_before.py index 3bc851c1..39b78aaa 100644 --- a/books/giant_before.py +++ b/books/comic/giant_before.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return GiantBefore -class GiantBefore(BaseComicBook): +class GiantBefore(CartoonMadBaseBook): title = u'进击的巨人BeforeTheFall' description = u'諫山創' language = 'zh-tw' @@ -13,4 +14,4 @@ class GiantBefore(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_giant.jpg' - mainurl = 'http://www.cartoonmad.com/comic/3413.html' + feeds = [(u'进击的巨人BeforeTheFall', 'http://www.cartoonmad.com/comic/3413.html')] diff --git a/books/hunter.py b/books/comic/hunter.py similarity index 63% rename from books/hunter.py rename to books/comic/hunter.py index a3a4e2b4..41ad5d89 100644 --- a/books/hunter.py +++ b/books/comic/hunter.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return Hunter -class Hunter(BaseComicBook): +class Hunter(CartoonMadBaseBook): title = u'全职猎人' description = u'日本漫画家富坚义博的一部漫画作品' language = 'zh-tw' @@ -13,4 +14,4 @@ class Hunter(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_hunter.jpg' - mainurl = 'http://www.cartoonmad.com/comic/1155.html' + feeds = [(u'全职猎人', 'http://www.cartoonmad.com/comic/1155.html')] diff --git a/books/myhero.py b/books/comic/myhero.py similarity index 62% rename from books/myhero.py rename to books/comic/myhero.py index edb958ea..c9743788 100644 --- a/books/myhero.py +++ b/books/comic/myhero.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return MyHero -class MyHero(BaseComicBook): +class MyHero(CartoonMadBaseBook): title = u'我的英雄学院' description = u'日本漫画家堀越耕平创作的少年漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class MyHero(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_myhero.jpg' - mainurl = 'http://www.cartoonmad.com/comic/4085.html' + feeds = [(u'我的英雄学院', 'http://www.cartoonmad.com/comic/4085.html')] diff --git a/books/onepiece.py b/books/comic/onepiece.py similarity index 63% rename from books/onepiece.py rename to books/comic/onepiece.py index 57a8f85d..9f0bf82b 100644 --- a/books/onepiece.py +++ b/books/comic/onepiece.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return Onepiece -class Onepiece(BaseComicBook): +class Onepiece(CartoonMadBaseBook): title = u'海贼王' description = u'日本漫画家尾田荣一郎创作的少年漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class Onepiece(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_onepiece.jpg' - mainurl = 'http://www.cartoonmad.com/comic/1152.html' + feeds = [(u'海贼王', 
'http://www.cartoonmad.com/comic/1152.html')] diff --git a/books/onepunch.py b/books/comic/onepunch.py similarity index 62% rename from books/onepunch.py rename to books/comic/onepunch.py index ef0bf199..386a01ec 100644 --- a/books/onepunch.py +++ b/books/comic/onepunch.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return OnePunch -class OnePunch(BaseComicBook): +class OnePunch(CartoonMadBaseBook): title = u'一拳超人' description = u'日本漫画家One创作的少年漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class OnePunch(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_onepunch.jpg' - mainurl = 'http://www.cartoonmad.com/comic/3583.html' + feeds = [(u'一拳超人', 'http://www.cartoonmad.com/comic/3583.html')] diff --git a/books/prison.py b/books/comic/prison.py similarity index 62% rename from books/prison.py rename to books/comic/prison.py index acb27e33..69bccb3a 100644 --- a/books/prison.py +++ b/books/comic/prison.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return Prison -class Prison(BaseComicBook): +class Prison(CartoonMadBaseBook): title = u'监狱学园' description = u'日本漫画家Akira创作的少年漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class Prison(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_prison.jpg' - mainurl = 'http://www.cartoonmad.com/comic/1416.html' + feeds = [(u'监狱学园', 'http://www.cartoonmad.com/comic/1416.html')] diff --git a/books/sevensins.py b/books/comic/sevensins.py similarity index 63% rename from books/sevensins.py rename to books/comic/sevensins.py index f0af9741..675eae2f 100644 --- a/books/sevensins.py +++ b/books/comic/sevensins.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return SevenSins -class SevenSins(BaseComicBook): +class SevenSins(CartoonMadBaseBook): title = u'七大罪' description = u'日本漫画家铃木央创作的少年漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class SevenSins(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_sevensins.jpg' - mainurl = 'http://www.cartoonmad.com/comic/2504.html' + feeds = [(u'七大罪', 'http://www.cartoonmad.com/comic/2504.html')] diff --git a/books/toyko.py b/books/comic/toyko.py similarity index 64% rename from books/toyko.py rename to books/comic/toyko.py index fedb1b32..73f67d4d 100644 --- a/books/toyko.py +++ b/books/comic/toyko.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return Tokyo -class Tokyo(BaseComicBook): +class Tokyo(CartoonMadBaseBook): title = u'东京食尸鬼re' description = u'日本漫画家石田翠作画的漫画,是前作《东京食尸鬼》的第二部' language = 'zh-tw' @@ -13,4 +14,4 @@ class Tokyo(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_tokyo.jpg' - mainurl = 'http://www.cartoonmad.com/comic/4270.html' + feeds = [(u'东京食尸鬼re', 'http://www.cartoonmad.com/comic/4270.html')] diff --git a/books/yuna.py b/books/comic/yuna.py similarity index 66% rename from books/yuna.py rename to books/comic/yuna.py index 0afcb6f7..a0d6a92d 100644 --- a/books/yuna.py +++ b/books/comic/yuna.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -from 
base import BaseComicBook +#Author: insert0003 +from .cartoonmadbase import CartoonMadBaseBook def getBook(): return Yuna -class Yuna(BaseComicBook): +class Yuna(CartoonMadBaseBook): title = u'摇曳庄的幽奈小姐' description = u'三浦忠弘(ミウラタダヒロ)创作,2016年2月8日开始连载于《周刊少年JUMP》上的漫画' language = 'zh-tw' @@ -13,4 +14,4 @@ class Yuna(BaseComicBook): page_encoding = 'big5' mastheadfile = 'mh_comic.gif' coverfile = 'cv_yuna.jpg' - mainurl = 'http://www.cartoonmad.com/comic/4897.html' + feeds = [(u'摇曳庄的幽奈小姐', 'http://www.cartoonmad.com/comic/4897.html')] diff --git a/books/weixinbase.py b/books/weixinbase.py deleted file mode 100644 index 05c605b4..00000000 --- a/books/weixinbase.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -#通过搜狗微信搜索实现特定公众号文章爬取并推送至Kindle -#或者你可以作者(zhu327)的 来将微信公众号转换为RSS在KindleEar中直接订阅 -#Author: -# zhu327 - -import datetime, json, re, time, urlparse, urllib -import lxml.html, lxml.etree -from lib import feedparser -from lib.urlopener import URLOpener -from lib.autodecoder import AutoDecoder -from lib.weixin import process_eqs -from base import BaseFeedBook - -WEIXIN_URL = 'http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid={id}&eqs={eqs}&ekv={ekv}&page=1&t={t}' - -class WeixinBook(BaseFeedBook): - - #继承自BaseFeedBook,参数填写参考BaseFeedBook的注释 - - #每个子类必须重新定义这个属性,为搜狗微信公众搜索地址,例:http://weixin.sogou.com/gzh?openid=oIWsFt6yAL253-qrm9rkdugjSlOY - #每个链接格式为元组:(分节标题, URL) - #注意,如果分节标题是中文的话,增加u前缀,比如 - #(u'沪江英语', 'http://weixin.sogou.com/gzh?openid=oIWsFt6yAL253-qrm9rkdugjSlOY'), - feeds = [] - - def preprocess(self, html): - root = lxml.html.fromstring(html) - - # 抽取封面cover图片 - script = root.xpath('//*[@id="media"]/script/text()') - cover = None - if script: - l = _COVER_RE.findall(script[0]) - if l: - cover = l[0] - - # 抽取文章内容 - try: - content = root.xpath('//*[@id="js_content"]')[0] - except IndexError: - return '' - - # 处理图片链接 - for img in content.xpath('.//img'): - if not 'src' in img.attrib: - img.attrib['src'] = img.attrib.get('data-src', '') - - # 生成封面 - if cover: - coverelement = lxml.etree.Element('img') - coverelement.set('src', cover) - content.insert(0, coverelement) - - return lxml.html.tostring(content, encoding='unicode') - - def ParseFeedUrls(self): - """ return list like [(section,title,url,desc),..] 
""" - urls = [] - tnow = datetime.datetime.utcnow() - urladded = set() - - for feed in self.feeds: - section, url = feed[0], feed[1] - isfulltext = feed[2] if len(feed) > 2 else False - timeout = self.timeout+10 if isfulltext else self.timeout - opener = URLOpener(self.host, timeout=timeout) - - id = urlparse.urlparse(url).query.split('=')[1] - - result = opener.open(url) - if result.status_code == 200 and result.content: - if self.feed_encoding: - try: - content = result.content.decode(self.feed_encoding) - except UnicodeDecodeError: - content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) - else: - content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) - else: - self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url)) - continue - - eqs, ekv = process_eqs(content) - url = WEIXIN_URL.format(id=id, eqs=urllib.quote(eqs), ekv=ekv, t=int(time.time()*1000)) - - result = opener.open(url) - if result.status_code == 200 and result.content: - if self.feed_encoding: - try: - content = result.content.decode(self.feed_encoding) - except UnicodeDecodeError: - content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) - else: - content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) - content = content[content.find('{'):content.rfind('}')+1] - try: - content = json.loads(content) - except ValueError: - continue - - for e in content['items'][:self.max_articles_per_feed]: - e = feedparser.parse(e)['entries'][0] - updated = None - if hasattr(e, 'lastmodified') and e.lastmodified: - updated = float(e.lastmodified) - - if self.oldest_article > 0 and updated: - updated = datetime.datetime.utcfromtimestamp(updated) - delta = tnow - updated - if self.oldest_article > 365: - threshold = self.oldest_article #以秒为单位 - else: - threshold = 86400*self.oldest_article #以天为单位 - - if delta.days*86400+delta.seconds > threshold: - self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href)) - continue - - #支持HTTPS - if hasattr(e, 'href'): - if url.startswith('https://'): - urlfeed = e.href.replace('http://','https://') - else: - urlfeed = e.href - - if urlfeed in urladded: - continue - else: - urlfeed = '' - - desc = None - urls.append((section, e.title, urlfeed, desc)) - urladded.add(urlfeed) - else: - self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url)) - - return urls diff --git a/changelog.md b/changelog.md index d3876f03..0c2672a2 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,9 @@ # Changelog for KindleEar +## 1.25 + 1. 添加漫画模式和多本漫画书。 + 2. 添加对Kindle Voyage和Kindle PaperWhite3的支持。 + ## 1.24 1. 新特性:用户可以在GUI界面上直接上传修改封面图片,而且每个账号可单独设置不同的封面图片。 2. bugfix:修正 反“反盗链”特性 无法下载包含unicode字符的URL的图片的一个bug。 diff --git a/changelog_en.md b/changelog_en.md index 8e095723..c2a5aa15 100644 --- a/changelog_en.md +++ b/changelog_en.md @@ -1,5 +1,9 @@ # Changelog for KindleEar +## 1.25 + 1. Added comic mode and several comic books. + 2. Added supported to Kindle Voyage and Kindle PaperWhite3. + ## 1.24 1. New feature: users can upload cover image via by web page directly. 2. bugfix: fix a bug that anti 'anti-pirate-link' feature can't deal with urls which contain unicode characters. 
diff --git a/helper.py b/helper.py index d7a073e8..4edb0ed5 100644 --- a/helper.py +++ b/helper.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- -""" uploader helper for KindleEar +""" uploader helper for KindleEar It will modify AppId and some other items for you automatically. Configure file 'custom.txt' format (encoding of the file must be ascii): application: YourAppId @@ -8,10 +8,10 @@ timezone: 8 If it not exist, this script will create it in same directory of __file__. """ -import os, re, codecs, locale -__Author__ = 'cdhigh' -__Version__ = '1.3' -__Date__ = '2015-08-20' +import os, sys, re, codecs, locale, shutil +__Author__ = 'cdhigh ' +__Version__ = '1.4' +__Date__ = '2017-09-03' CUSTOM_FILE = 'custom.txt' KE_DIR = 'KindleEar' @@ -21,11 +21,15 @@ PAT_DOMAIN = r"^DOMAIN\s*=\s*[\"\']([\w:/\.-]+)[\"\'](.*)" PAT_TZ = r"^TIMEZONE\s*=\s*?(-{0,1}\d+)(.*)" +try: + input = raw_input +except NameError: + pass #(re)move chinese books to a subdirectory (donot display in webpage) def RemoveChineseBooks(ke_dir): lang = 'zh_CN' - cn_books = [] + cn_books = [] #Relative path saved loc = locale.getdefaultlocale() if loc and len(loc) > 1: lang = loc[0] @@ -36,53 +40,64 @@ def RemoveChineseBooks(ke_dir): books_dir = os.path.join(ke_dir, 'books') if not os.path.exists(books_dir): return - for bkfile in os.listdir(books_dir): - if bkfile.endswith('.py') and not bkfile.startswith('__') and not bkfile.endswith("base.py"): - slbk = [] + list_book_dirs = os.walk(books_dir) + for root, dirs, files in list_book_dirs: + for f in files: + if not f.endswith('.py') or f.startswith('__') or f == 'base.py': + continue + + bkfile = os.path.join(root, f) + rel_path_bkfile = bkfile.replace(books_dir, '').lstrip('/').lstrip('\\') #Relative path + all_lines = [] try: - with codecs.open(os.path.join(books_dir, bkfile), 'r', 'utf-8') as f: - slbk = f.read().split('\n') + with codecs.open(bkfile, 'r', 'utf-8') as f: + all_lines = f.read().split('\n') except: continue - - if not slbk: + + if not all_lines: continue - + iscnbook = False - for line in slbk: + for line in all_lines: ln = line.replace(' ', '').replace('\t', '') - if ln.startswith('title='): #title line + if ln.startswith(('title=', 'description=')): #title line for ch in ln: if u'\u4e00' <= ch <= u'\u9fff': #Chinese Chars iscnbook = True break - if not iscnbook: - break #next book - - if iscnbook: #Is Chinese Book - cn_books.append(os.path.join(books_dir, bkfile)) - #*.pyc exists? - bookname = os.path.splitext(bkfile)[0] - pycfile = os.path.join(books_dir, bookname + '.pyc') + #if not iscnbook: + # break #next book + + if iscnbook: #Is Chinese Book + cn_books.append(rel_path_bkfile) + #*.pyc exists? + if rel_path_bkfile.endswith('.py'): + pycfile = rel_path_bkfile + 'c' if os.path.exists(pycfile): cn_books.append(pycfile) - break #next book - + if not cn_books: return #if exist some Chinese books, then ask for move or not - ret = raw_input('Do you want to remove Chinese books? (y/n)') + ret = input('Do you want to remove Chinese books? (y/n)') if ret not in ('Y', 'YES', 'y', 'yes'): return - + #check and create subdirectory bakdir = os.path.join(books_dir, 'ChineseBooksBak') if not os.path.exists(bakdir): os.makedirs(bakdir) - + for book in cn_books: - dst = os.path.join(bakdir, os.path.basename(book)) + dst = os.path.join(bakdir, book) + dst_dir = os.path.dirname(dst) #create dst directory + if not os.path.exists(dst_dir): + try: + os.makedirs(dst_dir) + except: + pass if os.path.exists(dst): #dst exist, try to remove it firstly. 
try: os.remove(dst) @@ -91,19 +106,30 @@ def RemoveChineseBooks(ke_dir): #remove book to bak directory try: - os.rename(book, dst) + shutil.move(os.path.join(books_dir, book), dst) except: try: - os.remove(book) + os.remove(os.path.join(books_dir, book)) except: pass - + + #Delete __init__.py of directory backup + list_bak_dir = os.walk(bakdir) + for root, dirs, files in list_bak_dir: + for f in files: + if f == '__init__.py' or f == '__init__.pyc': + #try: + os.remove(os.path.join(root, f)) + #except: + # pass + def Main(): #Searching for KindleEar folder ke_dir = os.path.join(os.path.dirname(__file__), KE_DIR) kem_dir = os.path.join(os.path.dirname(__file__), KE_MASTER_DIR) kemm_dir = os.path.join(kem_dir, KE_MASTER_DIR) - dirs = filter(os.path.exists, (ke_dir, kemm_dir, kem_dir)) + keup_dir = os.path.join(os.path.dirname(__file__), '..', KE_DIR) + dirs = list(filter(os.path.exists, (ke_dir, kemm_dir, kem_dir, keup_dir))) if not dirs: print("Cant found folder 'KindleEar'! Please download it from github firstly.") return 1 @@ -168,15 +194,15 @@ def Main(): elif line.lower().startswith('timezone:'): timezone = line[len('timezone:'):].strip() - ret = raw_input('Your custom info :\n\t app id : %s\n\t email : %s\n\ttimezone : %s\nCorrect? (y/n) : '%(app,email,timezone)) + ret = input('Your custom info :\n\t app id : %s\n\t email : %s\n\ttimezone : %s\nCorrect? (y/n) : '%(app,email,timezone)) if ret in ('y', 'yes', 'Y', 'YES'): needinput = False #configure items correct! while 1: if needinput or not all((app, email, timezone)): - new_app = raw_input('Input app id (%s): ' % app) - new_email = raw_input('Input your gmail (%s): ' % email) - new_timezone = raw_input('Input your timezone (%s): ' % timezone) + new_app = input('Input app id (%s): ' % app) + new_email = input('Input your gmail (%s): ' % email) + new_timezone = input('Input your timezone (%s): ' % timezone) app = new_app if new_app else app email = new_email if new_email else email timezone = new_timezone if new_timezone else timezone diff --git a/i18n/tr-tr/LC_MESSAGES/lang.mo b/i18n/tr-tr/LC_MESSAGES/lang.mo index 24a53f4d..38daed0b 100755 Binary files a/i18n/tr-tr/LC_MESSAGES/lang.mo and b/i18n/tr-tr/LC_MESSAGES/lang.mo differ diff --git a/i18n/tr-tr/LC_MESSAGES/lang.po b/i18n/tr-tr/LC_MESSAGES/lang.po index 58a870b3..56ea64de 100644 --- a/i18n/tr-tr/LC_MESSAGES/lang.po +++ b/i18n/tr-tr/LC_MESSAGES/lang.po @@ -88,9 +88,6 @@ msgstr "Ayarlar" msgid "Logs" msgstr "Kayıtlar" -msgid "UpdateLogs" -msgstr "Kayıtlar" - msgid "Admin" msgstr "Yönetim" @@ -623,3 +620,39 @@ msgstr "Failed to delete the cover image. Error:" msgid "Error when try to delete the cover image. Status:" msgstr "Error when try to delete the cover image. Status:" + +msgid "Book mode" +msgstr "Book mode" + +msgid "Periodical" +msgstr "Periodical" + +msgid "Comic" +msgstr "Comic" + +msgid "Please input a new number" +msgstr "Please input a new number" + +msgid "The number is invalid" +msgstr "The number is invalid" + +msgid "Unhappily : cannot change this record, Error:" +msgstr "Unhappily : cannot change this record, Error:" + +msgid "Error when try to change this record. Status:" +msgstr "Error when try to change this record. Status:" + +msgid "Unhappily : cannot delete this record, Error:" +msgstr "Unhappily : cannot delete this record, Error:" + +msgid "Error when try to delete this record. Status:" +msgstr "Error when try to delete this record. 
Status:" + +msgid "Last delivered" +msgstr "Last delivered" + +msgid "Num" +msgstr "Num" + +msgid "Record" +msgstr "Record" \ No newline at end of file diff --git a/i18n/zh-cn/LC_MESSAGES/lang.mo b/i18n/zh-cn/LC_MESSAGES/lang.mo index dc867d2b..ee8095e1 100755 Binary files a/i18n/zh-cn/LC_MESSAGES/lang.mo and b/i18n/zh-cn/LC_MESSAGES/lang.mo differ diff --git a/i18n/zh-cn/LC_MESSAGES/lang.po b/i18n/zh-cn/LC_MESSAGES/lang.po index aefed547..cd51da00 100644 --- a/i18n/zh-cn/LC_MESSAGES/lang.po +++ b/i18n/zh-cn/LC_MESSAGES/lang.po @@ -88,9 +88,6 @@ msgstr "设置" msgid "Logs" msgstr "投递日志" -msgid "UpdateLogs" -msgstr "漫画日志" - msgid "Admin" msgstr "账户管理" @@ -143,7 +140,7 @@ msgid "Operation" msgstr "命令" msgid "Change" -msgstr "改密码" +msgstr "修改" msgid "Delete" msgstr "删除" @@ -618,3 +615,39 @@ msgstr "删除封面图片失败。错误信息:" msgid "Error when try to delete the cover image. Status:" msgstr "在试图删除封面图片时出现异常。状态码:" + +msgid "Book mode" +msgstr "书籍模式" + +msgid "Periodical" +msgstr "期刊" + +msgid "Comic" +msgstr "漫画" + +msgid "Please input a new number" +msgstr "请输入一个新的数值" + +msgid "The number is invalid" +msgstr "数值非法" + +msgid "Unhappily : cannot change this record, Error:" +msgstr "非常遗憾:无法修改此记录,错误码:" + +msgid "Error when try to change this record. Status:" +msgstr "在试图修改此记录时出错。错误码:" + +msgid "Unhappily : cannot delete this record, Error:" +msgstr "非常遗憾:无法删除此记录,错误码:" + +msgid "Error when try to delete this record. Status:" +msgstr "在试图删除此记录时出错。错误码:" + +msgid "Last delivered" +msgstr "已推送期号" + +msgid "Num" +msgstr "期号" + +msgid "Record" +msgstr "信息" diff --git a/index.yaml b/index.yaml index 0357fbd1..0603af42 100644 --- a/index.yaml +++ b/index.yaml @@ -25,3 +25,9 @@ indexes: properties: - name: book - name: time + +- kind: LastDelivered + properties: + - name: username + - name: datetime + direction: desc diff --git a/lib/autodecoder.py b/lib/autodecoder.py index 132d22e5..1c715a8a 100644 --- a/lib/autodecoder.py +++ b/lib/autodecoder.py @@ -33,7 +33,7 @@ def __init__(self, isfeed=True): self.encoding = None self.isfeed = isfeed #True:Feed,False:page - def decode(self, content, url, headers=None): + def decode(self, content, url=None, headers=None): if not content: return '' @@ -71,7 +71,7 @@ def decode(self, content, url, headers=None): return self.decode_by_chardet(content, url) - def decode_by_chardet(self, content, url): + def decode_by_chardet(self, content, url=None): """有双级缓存的解码器 第一级缓存是上一篇文章的编码,第二级缓存是数据库保存的此网站编码""" result = content @@ -91,23 +91,29 @@ def decode_by_chardet(self, content, url): else: # 保存下次使用,以节省时间 self.encoding = encoding #同时保存到数据库 - netloc = urlparse.urlsplit(url)[1] - urlenc = UrlEncoding.all().filter('netloc = ', netloc).get() - if urlenc: - enc = urlenc.feedenc if self.isfeed else urlenc.pageenc - if enc != encoding: - if self.isfeed: - urlenc.feedenc = encoding - else: - urlenc.pageenc = encoding - urlenc.put() - elif self.isfeed: - UrlEncoding(netloc=netloc,feedenc=encoding).put() - else: - UrlEncoding(netloc=netloc,pageenc=encoding).put() + if url: + netloc = urlparse.urlsplit(url)[1] + urlenc = UrlEncoding.all().filter('netloc = ', netloc).get() + if urlenc: + enc = urlenc.feedenc if self.isfeed else urlenc.pageenc + if enc != encoding: + if self.isfeed: + urlenc.feedenc = encoding + else: + urlenc.pageenc = encoding + urlenc.put() + elif self.isfeed: + UrlEncoding(netloc=netloc,feedenc=encoding).put() + else: + UrlEncoding(netloc=netloc,pageenc=encoding).put() else: # 暂时没有之前的编码信息 - netloc = urlparse.urlsplit(url)[1] - urlenc = UrlEncoding.all().filter('netloc = ', netloc).get() 
+ if url: + netloc = urlparse.urlsplit(url)[1] + urlenc = UrlEncoding.all().filter('netloc = ', netloc).get() + else: + netloc = None + urlenc = None + if urlenc: #先看数据库有没有 enc = urlenc.feedenc if self.isfeed else urlenc.pageenc if enc: @@ -134,12 +140,13 @@ def decode_by_chardet(self, content, url): result = content else: #保存到数据库 - newurlenc = urlenc if urlenc else UrlEncoding(netloc=netloc) - if self.isfeed: - newurlenc.feedenc = self.encoding - else: - newurlenc.pageenc = self.encoding - newurlenc.put() + if url: + newurlenc = urlenc if urlenc else UrlEncoding(netloc=netloc) + if self.isfeed: + newurlenc.feedenc = self.encoding + else: + newurlenc.pageenc = self.encoding + newurlenc.put() default_log.warn('Decoded (%s) by chardet: [%s]' % (self.encoding or 'Unknown Encoding', url)) diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index f3dd7557..46caac04 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -5,26 +5,31 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a (possibly invalid) document into a tree representation. Beautiful Soup -provides provides methods and Pythonic idioms that make it easy to -navigate, search, and modify the parse tree. +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. -Beautiful Soup works with Python 2.6 and up. It works better if lxml +Beautiful Soup works with Python 2.7 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ + """ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.4.1" -__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" +__version__ = "4.5.3" +__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] import os import re +import traceback import warnings from .builder import builder_registry, ParserRejectedMarkup @@ -77,7 +82,7 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. 
To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, @@ -137,6 +142,10 @@ def deprecated_argument(old_name, new_name): from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") + if from_encoding and isinstance(markup, unicode): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError( @@ -161,19 +170,29 @@ def deprecated_argument(old_name, new_name): markup_type = "XML" else: markup_type = "HTML" + + caller = traceback.extract_stack()[0] + filename = caller[0] + line_number = caller[1] warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + filename=filename, + line_number=line_number, parser=builder.NAME, markup_type=markup_type)) self.builder = builder self.is_xml = builder.is_xml + self.known_xml = self.is_xml self.builder.soup = self self.parse_only = parse_only if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - elif len(markup) <= 256: + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, unicode) and not u'<' in markup) + ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, @@ -195,16 +214,10 @@ def deprecated_argument(old_name, new_name): if isinstance(markup, unicode): markup = markup.encode("utf8") warnings.warn( - '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) - if markup[:5] == "http:" or markup[:6] == "https:": - # TODO: This is ugly but I couldn't get it to work in - # Python 3 otherwise. - if ((isinstance(markup, bytes) and not b' ' in markup) - or (isinstance(markup, unicode) and not u' ' in markup)): - if isinstance(markup, unicode): - markup = markup.encode("utf8") - warnings.warn( - '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) + '"%s" looks like a filename, not markup. You should' + 'probably open this file and pass the filehandle into' + 'Beautiful Soup.' % markup) + self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( @@ -223,15 +236,52 @@ def deprecated_argument(old_name, new_name): self.builder.soup = None def __copy__(self): - return type(self)(self.encode(), builder=self.builder) + copy = type(self)( + self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' + ) + + # Although we encoded the tree to UTF-8, that may not have + # been the encoding of the original markup. Set the copy's + # .original_encoding to reflect the original object's + # .original_encoding. + copy.original_encoding = self.original_encoding + return copy def __getstate__(self): # Frequently a tree builder can't be pickled. 
d = dict(self.__dict__) if 'builder' in d and not self.builder.picklable: - del d['builder'] + d['builder'] = None return d + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). + """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, unicode): + space = u' ' + cant_start_with = (u"http:", u"https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + def _feed(self): # Convert the document to Unicode. self.builder.reset() @@ -335,7 +385,18 @@ def object_was_parsed(self, o, parent=None, most_recent_element=None): if parent.next_sibling: # This node is being inserted into an element that has # already been parsed. Deal with any dangling references. - index = parent.contents.index(o) + index = len(parent.contents)-1 + while index >= 0: + if parent.contents[index] is o: + break + index -= 1 + else: + raise ValueError( + "Error building tree: supposedly %r was inserted " + "into %r after the fact, but I don't see it!" % ( + o, parent + ) + ) if index == 0: previous_element = parent previous_sibling = None @@ -387,7 +448,7 @@ def handle_starttag(self, name, namespace, nsprefix, attrs): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the - SoupStrainer. You should proceed as if the tag had not occured + SoupStrainer. You should proceed as if the tag had not occurred in the document. For instance, if this was a self-closing tag, don't call handle_endtag. """ diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index f8fce568..601979bf 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -1,9 +1,13 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + from collections import defaultdict import itertools import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + HTMLAwareEntitySubstitution, whitespace_re ) @@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - preserve_whitespace_tags = set(['pre', 'textarea']) + preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index 8725a658..5f548935 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -1,9 +1,12 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ __all__ = [ 'HTML5TreeBuilder', ] -from pdb import set_trace import warnings +import re from bs4.builder import ( PERMISSIVE, HTML, @@ -15,7 +18,10 @@ whitespace_re, ) import html5lib -from html5lib.constants import namespaces +from html5lib.constants import ( + namespaces, + prefixes, + ) from bs4.element import ( Comment, Doctype, @@ -23,6 +29,15 @@ Tag, ) +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError, e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" @@ -47,7 +62,14 @@ def feed(self, markup): if self.soup.parse_only is not None: warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup, encoding=self.user_specified_encoding) + + extra_kwargs = dict() + if not isinstance(markup, unicode): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. if isinstance(markup, unicode): @@ -55,11 +77,17 @@ def feed(self, markup): # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: - doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, basestring): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. 
+ original_encoding = original_encoding.name + doc.original_encoding = original_encoding def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( - self.soup, namespaceHTMLElements) + namespaceHTMLElements, self.soup) return self.underlying_builder def test_fragment_to_document(self, fragment): @@ -67,10 +95,14 @@ def test_fragment_to_document(self, fragment): return u'%s' % fragment -class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): - def __init__(self, soup, namespaceHTMLElements): - self.soup = soup + def __init__(self, namespaceHTMLElements, soup=None): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): @@ -93,7 +125,8 @@ def commentClass(self, data): return TextNode(Comment(data), self.soup) def fragmentClass(self): - self.soup = BeautifulSoup("") + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) @@ -105,7 +138,57 @@ def getDocument(self): return self.soup def getFragment(self): - return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + return treebuilder_base.TreeBuilder.getFragment(self).element + + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s" % (' ' * indent, name)) + else: + rv.append("|%s" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in element.attrs.items(): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) class AttrList(object): def __init__(self, element): @@ -137,9 +220,9 @@ def __contains__(self, name): return name in list(self.attrs.keys()) -class Element(html5lib.treebuilders._base.Node): +class Element(treebuilder_base.Node): def __init__(self, element, soup, namespace): - html5lib.treebuilders._base.Node.__init__(self, element.name) + treebuilder_base.Node.__init__(self, element.name) self.element = element self.soup = soup self.namespace = namespace @@ -158,8 +241,10 @@ def appendChild(self, node): child = node elif node.element.__class__ == NavigableString: string_child = child = node.element + node.parent = self else: 
child = node.element + node.parent = self if not isinstance(child, basestring) and child.parent is not None: node.element.extract() @@ -197,6 +282,8 @@ def appendChild(self, node): most_recent_element=most_recent_element) def getAttributes(self): + if isinstance(self.element, Comment): + return {} return AttrList(self.element) def setAttributes(self, attributes): @@ -224,11 +311,11 @@ def setAttributes(self, attributes): attributes = property(getAttributes, setAttributes) def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) if insertBefore: - text = TextNode(self.soup.new_string(data), self.soup) - self.insertBefore(data, insertBefore) + self.insertBefore(text, insertBefore) else: - self.appendChild(data) + self.appendChild(text) def insertBefore(self, node, refNode): index = self.element.index(refNode.element) @@ -250,6 +337,7 @@ def reparentChildren(self, new_parent): # print "MOVE", self.element.contents # print "FROM", self.element # print "TO", new_parent.element + element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -268,7 +356,6 @@ def reparentChildren(self, new_parent): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent @@ -285,12 +372,19 @@ def reparentChildren(self, new_parent): if new_parents_last_child: new_parents_last_child.next_sibling = first_child - # Fix the last child's next_element and next_sibling - last_child = to_append[-1] - last_child.next_element = new_parents_last_descendant_next_element + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element if new_parents_last_descendant_next_element: - new_parents_last_descendant_next_element.previous_element = last_child - last_child.next_sibling = None + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None for child in to_append: child.parent = new_parent_element @@ -324,7 +418,7 @@ def getNameTuple(self): class TextNode(Element): def __init__(self, element, soup): - html5lib.treebuilders._base.Node.__init__(self, None) + treebuilder_base.Node.__init__(self, None) self.element = element self.soup = soup diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py index 9e8f88fb..d2ca2872 100644 --- a/lib/bs4/builder/_lxml.py +++ b/lib/bs4/builder/_lxml.py @@ -1,3 +1,5 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
__all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', @@ -12,6 +14,7 @@ Doctype, NamespacedAttribute, ProcessingInstruction, + XMLProcessingInstruction, ) from bs4.builder import ( FAST, @@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser is_xml = True + processing_instruction_class = XMLProcessingInstruction NAME = "lxml-xml" ALTERNATE_NAMES = ["xml"] @@ -87,6 +91,16 @@ def prepare_markup(self, markup, user_specified_encoding=None, Each 4-tuple represents a strategy for parsing the document. """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? @@ -98,11 +112,6 @@ def prepare_markup(self, markup, user_specified_encoding=None, yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) - # Instead of using UnicodeDammit to convert the bytestring to - # Unicode using different encodings, use EncodingDetector to - # iterate over the encodings, and tell lxml to try to parse - # the document as each one in turn. - is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) @@ -201,7 +210,7 @@ def end(self, name): def pi(self, target, data): self.soup.endData() self.soup.handle_data(target + ' ' + data) - self.soup.endData(ProcessingInstruction) + self.soup.endData(self.processing_instruction_class) def data(self, content): self.soup.handle_data(content) @@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] is_xml = False + processing_instruction_class = ProcessingInstruction def default_parser(self, encoding): return etree.HTMLParser diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py index 636f81b4..7965565f 100644 --- a/lib/bs4/dammit.py +++ b/lib/bs4/dammit.py @@ -6,9 +6,10 @@ Feed Parser. It works best on XML and HTML, but it does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. """ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
__license__ = "MIT" -from pdb import set_trace import codecs from htmlentitydefs import codepoint2name import re @@ -309,7 +310,7 @@ def find_declared_encoding(cls, markup, is_html=False, search_entire_document=Fa else: xml_endpos = 1024 html_endpos = max(2048, int(len(markup) * 0.05)) - + declared_encoding = None declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) if not declared_encoding_match and is_html: @@ -346,7 +347,7 @@ def __init__(self, markup, override_encodings=[], self.tried_encodings = [] self.contains_replacement_characters = False self.is_html = is_html - + self.log = logging.getLogger(__name__) self.detector = EncodingDetector( markup, override_encodings, is_html, exclude_encodings) @@ -376,9 +377,10 @@ def __init__(self, markup, override_encodings=[], if encoding != "ascii": u = self._convert_from(encoding, "replace") if u is not None: - logging.warning( + self.log.warning( "Some characters could not be decoded, and were " - "replaced with REPLACEMENT CHARACTER.") + "replaced with REPLACEMENT CHARACTER." + ) self.contains_replacement_characters = True break @@ -734,7 +736,7 @@ def _codec(self, charset): 0xde : b'\xc3\x9e', # Þ 0xdf : b'\xc3\x9f', # ß 0xe0 : b'\xc3\xa0', # à - 0xe1 : b'\xa1', # á + 0xe1 : b'\xa1', # á 0xe2 : b'\xc3\xa2', # â 0xe3 : b'\xc3\xa3', # ã 0xe4 : b'\xc3\xa4', # ä diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py index c04d23c3..8768332f 100644 --- a/lib/bs4/diagnose.py +++ b/lib/bs4/diagnose.py @@ -1,5 +1,7 @@ """Diagnostic functions, mainly for use when doing tech support.""" +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __license__ = "MIT" import cProfile @@ -56,7 +58,8 @@ def diagnose(data): data = data.read() elif os.path.exists(data): print '"%s" looks like a filename. Reading data from the file.' % data - data = open(data).read() + with open(data) as fp: + data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." diff --git a/lib/bs4/element.py b/lib/bs4/element.py index ecf2b280..b100d18b 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -1,8 +1,10 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __license__ = "MIT" -from pdb import set_trace import collections import re +import shlex import sys import warnings from bs4.dammit import EntitySubstitution @@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): preformatted_tags = set(["pre"]) + preserve_whitespace_tags = set(['pre', 'textarea']) + @classmethod def _substitute_if_appropriate(cls, ns, f): if (isinstance(ns, NavigableString) @@ -169,11 +173,19 @@ def _is_xml(self): This is used when mapping a formatter name ("minimal") to an appropriate function (one that performs entity-substitution on - the contents of