'
+__Version__ = '1.4'
+__Date__ = '2017-09-03'
CUSTOM_FILE = 'custom.txt'
KE_DIR = 'KindleEar'
@@ -21,11 +21,15 @@
PAT_DOMAIN = r"^DOMAIN\s*=\s*[\"\']([\w:/\.-]+)[\"\'](.*)"
PAT_TZ = r"^TIMEZONE\s*=\s*?(-{0,1}\d+)(.*)"
+try:
+ input = raw_input
+except NameError:
+ pass
#(re)move chinese books to a subdirectory (donot display in webpage)
def RemoveChineseBooks(ke_dir):
lang = 'zh_CN'
- cn_books = []
+    cn_books = [] #Relative paths (under books_dir) of detected Chinese books
loc = locale.getdefaultlocale()
if loc and len(loc) > 1:
lang = loc[0]
@@ -36,53 +40,64 @@ def RemoveChineseBooks(ke_dir):
books_dir = os.path.join(ke_dir, 'books')
if not os.path.exists(books_dir):
return
- for bkfile in os.listdir(books_dir):
- if bkfile.endswith('.py') and not bkfile.startswith('__') and not bkfile.endswith("base.py"):
- slbk = []
+ list_book_dirs = os.walk(books_dir)
+ for root, dirs, files in list_book_dirs:
+ for f in files:
+ if not f.endswith('.py') or f.startswith('__') or f == 'base.py':
+ continue
+
+ bkfile = os.path.join(root, f)
+ rel_path_bkfile = bkfile.replace(books_dir, '').lstrip('/').lstrip('\\') #Relative path
+ all_lines = []
try:
- with codecs.open(os.path.join(books_dir, bkfile), 'r', 'utf-8') as f:
- slbk = f.read().split('\n')
+ with codecs.open(bkfile, 'r', 'utf-8') as f:
+ all_lines = f.read().split('\n')
except:
continue
-
- if not slbk:
+
+ if not all_lines:
continue
-
+
iscnbook = False
- for line in slbk:
+ for line in all_lines:
ln = line.replace(' ', '').replace('\t', '')
- if ln.startswith('title='): #title line
+ if ln.startswith(('title=', 'description=')): #title line
for ch in ln:
if u'\u4e00' <= ch <= u'\u9fff': #Chinese Chars
iscnbook = True
break
- if not iscnbook:
- break #next book
-
- if iscnbook: #Is Chinese Book
- cn_books.append(os.path.join(books_dir, bkfile))
- #*.pyc exists?
- bookname = os.path.splitext(bkfile)[0]
- pycfile = os.path.join(books_dir, bookname + '.pyc')
+ #if not iscnbook:
+ # break #next book
+
+ if iscnbook: #Is Chinese Book
+ cn_books.append(rel_path_bkfile)
+ #*.pyc exists?
+ if rel_path_bkfile.endswith('.py'):
+ pycfile = rel_path_bkfile + 'c'
if os.path.exists(pycfile):
cn_books.append(pycfile)
- break #next book
-
+
if not cn_books:
return
#if exist some Chinese books, then ask for move or not
- ret = raw_input('Do you want to remove Chinese books? (y/n)')
+ ret = input('Do you want to remove Chinese books? (y/n)')
if ret not in ('Y', 'YES', 'y', 'yes'):
return
-
+
#check and create subdirectory
bakdir = os.path.join(books_dir, 'ChineseBooksBak')
if not os.path.exists(bakdir):
os.makedirs(bakdir)
-
+
for book in cn_books:
- dst = os.path.join(bakdir, os.path.basename(book))
+ dst = os.path.join(bakdir, book)
+ dst_dir = os.path.dirname(dst) #create dst directory
+ if not os.path.exists(dst_dir):
+ try:
+ os.makedirs(dst_dir)
+ except:
+ pass
if os.path.exists(dst): #dst exist, try to remove it firstly.
try:
os.remove(dst)
@@ -91,19 +106,30 @@ def RemoveChineseBooks(ke_dir):
#remove book to bak directory
try:
- os.rename(book, dst)
+ shutil.move(os.path.join(books_dir, book), dst)
except:
try:
- os.remove(book)
+ os.remove(os.path.join(books_dir, book))
except:
pass
-
+
+    #Remove __init__.py / __init__.pyc left over in the backup directory tree
+ list_bak_dir = os.walk(bakdir)
+ for root, dirs, files in list_bak_dir:
+ for f in files:
+ if f == '__init__.py' or f == '__init__.pyc':
+ #try:
+ os.remove(os.path.join(root, f))
+ #except:
+ # pass
+
def Main():
#Searching for KindleEar folder
ke_dir = os.path.join(os.path.dirname(__file__), KE_DIR)
kem_dir = os.path.join(os.path.dirname(__file__), KE_MASTER_DIR)
kemm_dir = os.path.join(kem_dir, KE_MASTER_DIR)
- dirs = filter(os.path.exists, (ke_dir, kemm_dir, kem_dir))
+ keup_dir = os.path.join(os.path.dirname(__file__), '..', KE_DIR)
+ dirs = list(filter(os.path.exists, (ke_dir, kemm_dir, kem_dir, keup_dir)))
if not dirs:
print("Cant found folder 'KindleEar'! Please download it from github firstly.")
return 1
@@ -168,15 +194,15 @@ def Main():
elif line.lower().startswith('timezone:'):
timezone = line[len('timezone:'):].strip()
- ret = raw_input('Your custom info :\n\t app id : %s\n\t email : %s\n\ttimezone : %s\nCorrect? (y/n) : '%(app,email,timezone))
+ ret = input('Your custom info :\n\t app id : %s\n\t email : %s\n\ttimezone : %s\nCorrect? (y/n) : '%(app,email,timezone))
if ret in ('y', 'yes', 'Y', 'YES'):
needinput = False #configure items correct!
while 1:
if needinput or not all((app, email, timezone)):
- new_app = raw_input('Input app id (%s): ' % app)
- new_email = raw_input('Input your gmail (%s): ' % email)
- new_timezone = raw_input('Input your timezone (%s): ' % timezone)
+ new_app = input('Input app id (%s): ' % app)
+ new_email = input('Input your gmail (%s): ' % email)
+ new_timezone = input('Input your timezone (%s): ' % timezone)
app = new_app if new_app else app
email = new_email if new_email else email
timezone = new_timezone if new_timezone else timezone
diff --git a/i18n/tr-tr/LC_MESSAGES/lang.mo b/i18n/tr-tr/LC_MESSAGES/lang.mo
index 24a53f4d..38daed0b 100755
Binary files a/i18n/tr-tr/LC_MESSAGES/lang.mo and b/i18n/tr-tr/LC_MESSAGES/lang.mo differ
diff --git a/i18n/tr-tr/LC_MESSAGES/lang.po b/i18n/tr-tr/LC_MESSAGES/lang.po
index 58a870b3..56ea64de 100644
--- a/i18n/tr-tr/LC_MESSAGES/lang.po
+++ b/i18n/tr-tr/LC_MESSAGES/lang.po
@@ -88,9 +88,6 @@ msgstr "Ayarlar"
msgid "Logs"
msgstr "Kayıtlar"
-msgid "UpdateLogs"
-msgstr "Kayıtlar"
-
msgid "Admin"
msgstr "Yönetim"
@@ -623,3 +620,39 @@ msgstr "Failed to delete the cover image. Error:"
msgid "Error when try to delete the cover image. Status:"
msgstr "Error when try to delete the cover image. Status:"
+
+msgid "Book mode"
+msgstr "Book mode"
+
+msgid "Periodical"
+msgstr "Periodical"
+
+msgid "Comic"
+msgstr "Comic"
+
+msgid "Please input a new number"
+msgstr "Please input a new number"
+
+msgid "The number is invalid"
+msgstr "The number is invalid"
+
+msgid "Unhappily : cannot change this record, Error:"
+msgstr "Unhappily : cannot change this record, Error:"
+
+msgid "Error when try to change this record. Status:"
+msgstr "Error when try to change this record. Status:"
+
+msgid "Unhappily : cannot delete this record, Error:"
+msgstr "Unhappily : cannot delete this record, Error:"
+
+msgid "Error when try to delete this record. Status:"
+msgstr "Error when try to delete this record. Status:"
+
+msgid "Last delivered"
+msgstr "Last delivered"
+
+msgid "Num"
+msgstr "Num"
+
+msgid "Record"
+msgstr "Record"
\ No newline at end of file
diff --git a/i18n/zh-cn/LC_MESSAGES/lang.mo b/i18n/zh-cn/LC_MESSAGES/lang.mo
index dc867d2b..ee8095e1 100755
Binary files a/i18n/zh-cn/LC_MESSAGES/lang.mo and b/i18n/zh-cn/LC_MESSAGES/lang.mo differ
diff --git a/i18n/zh-cn/LC_MESSAGES/lang.po b/i18n/zh-cn/LC_MESSAGES/lang.po
index aefed547..cd51da00 100644
--- a/i18n/zh-cn/LC_MESSAGES/lang.po
+++ b/i18n/zh-cn/LC_MESSAGES/lang.po
@@ -88,9 +88,6 @@ msgstr "设置"
msgid "Logs"
msgstr "投递日志"
-msgid "UpdateLogs"
-msgstr "漫画日志"
-
msgid "Admin"
msgstr "账户管理"
@@ -143,7 +140,7 @@ msgid "Operation"
msgstr "命令"
msgid "Change"
-msgstr "改密码"
+msgstr "修改"
msgid "Delete"
msgstr "删除"
@@ -618,3 +615,39 @@ msgstr "删除封面图片失败。错误信息:"
msgid "Error when try to delete the cover image. Status:"
msgstr "在试图删除封面图片时出现异常。状态码:"
+
+msgid "Book mode"
+msgstr "书籍模式"
+
+msgid "Periodical"
+msgstr "期刊"
+
+msgid "Comic"
+msgstr "漫画"
+
+msgid "Please input a new number"
+msgstr "请输入一个新的数值"
+
+msgid "The number is invalid"
+msgstr "数值非法"
+
+msgid "Unhappily : cannot change this record, Error:"
+msgstr "非常遗憾:无法修改此记录,错误码:"
+
+msgid "Error when try to change this record. Status:"
+msgstr "在试图修改此记录时出错。错误码:"
+
+msgid "Unhappily : cannot delete this record, Error:"
+msgstr "非常遗憾:无法删除此记录,错误码:"
+
+msgid "Error when try to delete this record. Status:"
+msgstr "在试图删除此记录时出错。错误码:"
+
+msgid "Last delivered"
+msgstr "已推送期号"
+
+msgid "Num"
+msgstr "期号"
+
+msgid "Record"
+msgstr "信息"
diff --git a/index.yaml b/index.yaml
index 0357fbd1..0603af42 100644
--- a/index.yaml
+++ b/index.yaml
@@ -25,3 +25,9 @@ indexes:
properties:
- name: book
- name: time
+
+- kind: LastDelivered
+ properties:
+ - name: username
+ - name: datetime
+ direction: desc
diff --git a/lib/autodecoder.py b/lib/autodecoder.py
index 132d22e5..1c715a8a 100644
--- a/lib/autodecoder.py
+++ b/lib/autodecoder.py
@@ -33,7 +33,7 @@ def __init__(self, isfeed=True):
self.encoding = None
self.isfeed = isfeed #True:Feed,False:page
- def decode(self, content, url, headers=None):
+ def decode(self, content, url=None, headers=None):
if not content:
return ''
@@ -71,7 +71,7 @@ def decode(self, content, url, headers=None):
return self.decode_by_chardet(content, url)
- def decode_by_chardet(self, content, url):
+ def decode_by_chardet(self, content, url=None):
"""有双级缓存的解码器
第一级缓存是上一篇文章的编码,第二级缓存是数据库保存的此网站编码"""
result = content
@@ -91,23 +91,29 @@ def decode_by_chardet(self, content, url):
else: # 保存下次使用,以节省时间
self.encoding = encoding
#同时保存到数据库
- netloc = urlparse.urlsplit(url)[1]
- urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
- if urlenc:
- enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
- if enc != encoding:
- if self.isfeed:
- urlenc.feedenc = encoding
- else:
- urlenc.pageenc = encoding
- urlenc.put()
- elif self.isfeed:
- UrlEncoding(netloc=netloc,feedenc=encoding).put()
- else:
- UrlEncoding(netloc=netloc,pageenc=encoding).put()
+ if url:
+ netloc = urlparse.urlsplit(url)[1]
+ urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
+ if urlenc:
+ enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
+ if enc != encoding:
+ if self.isfeed:
+ urlenc.feedenc = encoding
+ else:
+ urlenc.pageenc = encoding
+ urlenc.put()
+ elif self.isfeed:
+ UrlEncoding(netloc=netloc,feedenc=encoding).put()
+ else:
+ UrlEncoding(netloc=netloc,pageenc=encoding).put()
else: # 暂时没有之前的编码信息
- netloc = urlparse.urlsplit(url)[1]
- urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
+ if url:
+ netloc = urlparse.urlsplit(url)[1]
+ urlenc = UrlEncoding.all().filter('netloc = ', netloc).get()
+ else:
+ netloc = None
+ urlenc = None
+
if urlenc: #先看数据库有没有
enc = urlenc.feedenc if self.isfeed else urlenc.pageenc
if enc:
@@ -134,12 +140,13 @@ def decode_by_chardet(self, content, url):
result = content
else:
#保存到数据库
- newurlenc = urlenc if urlenc else UrlEncoding(netloc=netloc)
- if self.isfeed:
- newurlenc.feedenc = self.encoding
- else:
- newurlenc.pageenc = self.encoding
- newurlenc.put()
+ if url:
+ newurlenc = urlenc if urlenc else UrlEncoding(netloc=netloc)
+ if self.isfeed:
+ newurlenc.feedenc = self.encoding
+ else:
+ newurlenc.pageenc = self.encoding
+ newurlenc.put()
default_log.warn('Decoded (%s) by chardet: [%s]' % (self.encoding or 'Unknown Encoding', url))
diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py
index f3dd7557..46caac04 100644
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@@ -5,26 +5,31 @@
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.
-Beautiful Soup works with Python 2.6 and up. It works better if lxml
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
"""
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.4.1"
-__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
+__version__ = "4.5.3"
+__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
+import traceback
import warnings
from .builder import builder_registry, ParserRejectedMarkup
@@ -77,7 +82,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
- NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+ NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
@@ -137,6 +142,10 @@ def deprecated_argument(old_name, new_name):
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
+ if from_encoding and isinstance(markup, unicode):
+ warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+ from_encoding = None
+
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
@@ -161,19 +170,29 @@ def deprecated_argument(old_name, new_name):
markup_type = "XML"
else:
markup_type = "HTML"
+
+ caller = traceback.extract_stack()[0]
+ filename = caller[0]
+ line_number = caller[1]
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+ filename=filename,
+ line_number=line_number,
parser=builder.NAME,
markup_type=markup_type))
self.builder = builder
self.is_xml = builder.is_xml
+ self.known_xml = self.is_xml
self.builder.soup = self
self.parse_only = parse_only
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
- elif len(markup) <= 256:
+ elif len(markup) <= 256 and (
+ (isinstance(markup, bytes) and not b'<' in markup)
+ or (isinstance(markup, unicode) and not u'<' in markup)
+ ):
# Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
@@ -195,16 +214,10 @@ def deprecated_argument(old_name, new_name):
if isinstance(markup, unicode):
markup = markup.encode("utf8")
warnings.warn(
- '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
- if markup[:5] == "http:" or markup[:6] == "https:":
- # TODO: This is ugly but I couldn't get it to work in
- # Python 3 otherwise.
- if ((isinstance(markup, bytes) and not b' ' in markup)
- or (isinstance(markup, unicode) and not u' ' in markup)):
- if isinstance(markup, unicode):
- markup = markup.encode("utf8")
- warnings.warn(
- '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+ '"%s" looks like a filename, not markup. You should'
+ 'probably open this file and pass the filehandle into'
+ 'Beautiful Soup.' % markup)
+ self._check_markup_is_url(markup)
for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in (
@@ -223,15 +236,52 @@ def deprecated_argument(old_name, new_name):
self.builder.soup = None
def __copy__(self):
- return type(self)(self.encode(), builder=self.builder)
+ copy = type(self)(
+ self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+ )
+
+ # Although we encoded the tree to UTF-8, that may not have
+ # been the encoding of the original markup. Set the copy's
+ # .original_encoding to reflect the original object's
+ # .original_encoding.
+ copy.original_encoding = self.original_encoding
+ return copy
def __getstate__(self):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and not self.builder.picklable:
- del d['builder']
+ d['builder'] = None
return d
+ @staticmethod
+ def _check_markup_is_url(markup):
+ """
+ Check if markup looks like it's actually a url and raise a warning
+ if so. Markup can be unicode or str (py2) / bytes (py3).
+ """
+ if isinstance(markup, bytes):
+ space = b' '
+ cant_start_with = (b"http:", b"https:")
+ elif isinstance(markup, unicode):
+ space = u' '
+ cant_start_with = (u"http:", u"https:")
+ else:
+ return
+
+ if any(markup.startswith(prefix) for prefix in cant_start_with):
+ if not space in markup:
+ if isinstance(markup, bytes):
+ decoded_markup = markup.decode('utf-8', 'replace')
+ else:
+ decoded_markup = markup
+ warnings.warn(
+ '"%s" looks like a URL. Beautiful Soup is not an'
+ ' HTTP client. You should probably use an HTTP client like'
+ ' requests to get the document behind the URL, and feed'
+ ' that document to Beautiful Soup.' % decoded_markup
+ )
+
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
@@ -335,7 +385,18 @@ def object_was_parsed(self, o, parent=None, most_recent_element=None):
if parent.next_sibling:
# This node is being inserted into an element that has
# already been parsed. Deal with any dangling references.
- index = parent.contents.index(o)
+ index = len(parent.contents)-1
+ while index >= 0:
+ if parent.contents[index] is o:
+ break
+ index -= 1
+ else:
+ raise ValueError(
+ "Error building tree: supposedly %r was inserted "
+ "into %r after the fact, but I don't see it!" % (
+ o, parent
+ )
+ )
if index == 0:
previous_element = parent
previous_sibling = None
@@ -387,7 +448,7 @@ def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
- SoupStrainer. You should proceed as if the tag had not occured
+ SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py
index f8fce568..601979bf 100644
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -1,9 +1,13 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
+ HTMLAwareEntitySubstitution,
whitespace_re
)
@@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- preserve_whitespace_tags = set(['pre', 'textarea'])
+ preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py
index 8725a658..5f548935 100644
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -1,9 +1,12 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
__all__ = [
'HTML5TreeBuilder',
]
-from pdb import set_trace
import warnings
+import re
from bs4.builder import (
PERMISSIVE,
HTML,
@@ -15,7 +18,10 @@
whitespace_re,
)
import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+ namespaces,
+ prefixes,
+ )
from bs4.element import (
Comment,
Doctype,
@@ -23,6 +29,15 @@
Tag,
)
+try:
+ # Pre-0.99999999
+ from html5lib.treebuilders import _base as treebuilder_base
+ new_html5lib = False
+except ImportError, e:
+ # 0.99999999 and up
+ from html5lib.treebuilders import base as treebuilder_base
+ new_html5lib = True
+
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
@@ -47,7 +62,14 @@ def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
- doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+ extra_kwargs = dict()
+ if not isinstance(markup, unicode):
+ if new_html5lib:
+ extra_kwargs['override_encoding'] = self.user_specified_encoding
+ else:
+ extra_kwargs['encoding'] = self.user_specified_encoding
+ doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
@@ -55,11 +77,17 @@ def feed(self, markup):
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
- doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+ original_encoding = parser.tokenizer.stream.charEncoding[0]
+ if not isinstance(original_encoding, basestring):
+ # In 0.99999999 and up, the encoding is an html5lib
+ # Encoding object. We want to use a string for compatibility
+ # with other tree builders.
+ original_encoding = original_encoding.name
+ doc.original_encoding = original_encoding
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
- self.soup, namespaceHTMLElements)
+ namespaceHTMLElements, self.soup)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
@@ -67,10 +95,14 @@ def test_fragment_to_document(self, fragment):
return u'%s' % fragment
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
- def __init__(self, soup, namespaceHTMLElements):
- self.soup = soup
+ def __init__(self, namespaceHTMLElements, soup=None):
+ if soup:
+ self.soup = soup
+ else:
+ from bs4 import BeautifulSoup
+ self.soup = BeautifulSoup("", "html.parser")
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
@@ -93,7 +125,8 @@ def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
- self.soup = BeautifulSoup("")
+ from bs4 import BeautifulSoup
+ self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
@@ -105,7 +138,57 @@ def getDocument(self):
return self.soup
def getFragment(self):
- return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+ return treebuilder_base.TreeBuilder.getFragment(self).element
+
+ def testSerializer(self, element):
+ from bs4 import BeautifulSoup
+ rv = []
+ doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+
+ def serializeElement(element, indent=0):
+ if isinstance(element, BeautifulSoup):
+ pass
+ if isinstance(element, Doctype):
+ m = doctype_re.match(element)
+ if m:
+ name = m.group(1)
+ if m.lastindex > 1:
+ publicId = m.group(2) or ""
+ systemId = m.group(3) or m.group(4) or ""
+ rv.append("""|%s""" %
+ (' ' * indent, name, publicId, systemId))
+ else:
+ rv.append("|%s" % (' ' * indent, name))
+ else:
+ rv.append("|%s" % (' ' * indent,))
+ elif isinstance(element, Comment):
+ rv.append("|%s" % (' ' * indent, element))
+ elif isinstance(element, NavigableString):
+ rv.append("|%s\"%s\"" % (' ' * indent, element))
+ else:
+ if element.namespace:
+ name = "%s %s" % (prefixes[element.namespace],
+ element.name)
+ else:
+ name = element.name
+ rv.append("|%s<%s>" % (' ' * indent, name))
+ if element.attrs:
+ attributes = []
+ for name, value in element.attrs.items():
+ if isinstance(name, NamespacedAttribute):
+ name = "%s %s" % (prefixes[name.namespace], name.name)
+ if isinstance(value, list):
+ value = " ".join(value)
+ attributes.append((name, value))
+
+ for name, value in sorted(attributes):
+ rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+ indent += 2
+ for child in element.children:
+ serializeElement(child, indent)
+ serializeElement(element, 0)
+
+ return "\n".join(rv)
class AttrList(object):
def __init__(self, element):
@@ -137,9 +220,9 @@ def __contains__(self, name):
return name in list(self.attrs.keys())
-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
def __init__(self, element, soup, namespace):
- html5lib.treebuilders._base.Node.__init__(self, element.name)
+ treebuilder_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
@@ -158,8 +241,10 @@ def appendChild(self, node):
child = node
elif node.element.__class__ == NavigableString:
string_child = child = node.element
+ node.parent = self
else:
child = node.element
+ node.parent = self
if not isinstance(child, basestring) and child.parent is not None:
node.element.extract()
@@ -197,6 +282,8 @@ def appendChild(self, node):
most_recent_element=most_recent_element)
def getAttributes(self):
+ if isinstance(self.element, Comment):
+ return {}
return AttrList(self.element)
def setAttributes(self, attributes):
@@ -224,11 +311,11 @@ def setAttributes(self, attributes):
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
+ text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
- text = TextNode(self.soup.new_string(data), self.soup)
- self.insertBefore(data, insertBefore)
+ self.insertBefore(text, insertBefore)
else:
- self.appendChild(data)
+ self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
@@ -250,6 +337,7 @@ def reparentChildren(self, new_parent):
# print "MOVE", self.element.contents
# print "FROM", self.element
# print "TO", new_parent.element
+
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
@@ -268,7 +356,6 @@ def reparentChildren(self, new_parent):
new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents
- append_after = new_parent_element.contents
if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
@@ -285,12 +372,19 @@ def reparentChildren(self, new_parent):
if new_parents_last_child:
new_parents_last_child.next_sibling = first_child
- # Fix the last child's next_element and next_sibling
- last_child = to_append[-1]
- last_child.next_element = new_parents_last_descendant_next_element
+ # Find the very last element being moved. It is now the
+ # parent's last descendant. It has no .next_sibling and
+ # its .next_element is whatever the previous last
+ # descendant had.
+ last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+ last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element:
- new_parents_last_descendant_next_element.previous_element = last_child
- last_child.next_sibling = None
+ # TODO: This code has no test coverage and I'm not sure
+ # how to get html5lib to go through this path, but it's
+ # just the other side of the previous line.
+ new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+ last_childs_last_descendant.next_sibling = None
for child in to_append:
child.parent = new_parent_element
@@ -324,7 +418,7 @@ def getNameTuple(self):
class TextNode(Element):
def __init__(self, element, soup):
- html5lib.treebuilders._base.Node.__init__(self, None)
+ treebuilder_base.Node.__init__(self, None)
self.element = element
self.soup = soup
diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py
index 9e8f88fb..d2ca2872 100644
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@@ -1,3 +1,5 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
@@ -12,6 +14,7 @@
Doctype,
NamespacedAttribute,
ProcessingInstruction,
+ XMLProcessingInstruction,
)
from bs4.builder import (
FAST,
@@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
+ processing_instruction_class = XMLProcessingInstruction
NAME = "lxml-xml"
ALTERNATE_NAMES = ["xml"]
@@ -87,6 +91,16 @@ def prepare_markup(self, markup, user_specified_encoding=None,
Each 4-tuple represents a strategy for parsing the document.
"""
+ # Instead of using UnicodeDammit to convert the bytestring to
+ # Unicode using different encodings, use EncodingDetector to
+ # iterate over the encodings, and tell lxml to try to parse
+ # the document as each one in turn.
+ is_html = not self.is_xml
+ if is_html:
+ self.processing_instruction_class = ProcessingInstruction
+ else:
+ self.processing_instruction_class = XMLProcessingInstruction
+
if isinstance(markup, unicode):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
@@ -98,11 +112,6 @@ def prepare_markup(self, markup, user_specified_encoding=None,
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
- # Instead of using UnicodeDammit to convert the bytestring to
- # Unicode using different encodings, use EncodingDetector to
- # iterate over the encodings, and tell lxml to try to parse
- # the document as each one in turn.
- is_html = not self.is_xml
try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
@@ -201,7 +210,7 @@ def end(self, name):
def pi(self, target, data):
self.soup.endData()
self.soup.handle_data(target + ' ' + data)
- self.soup.endData(ProcessingInstruction)
+ self.soup.endData(self.processing_instruction_class)
def data(self, content):
self.soup.handle_data(content)
@@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
is_xml = False
+ processing_instruction_class = ProcessingInstruction
def default_parser(self, encoding):
return etree.HTMLParser
diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py
index 636f81b4..7965565f 100644
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@@ -6,9 +6,10 @@
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
__license__ = "MIT"
-from pdb import set_trace
import codecs
from htmlentitydefs import codepoint2name
import re
@@ -309,7 +310,7 @@ def find_declared_encoding(cls, markup, is_html=False, search_entire_document=Fa
else:
xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05))
-
+
declared_encoding = None
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html:
@@ -346,7 +347,7 @@ def __init__(self, markup, override_encodings=[],
self.tried_encodings = []
self.contains_replacement_characters = False
self.is_html = is_html
-
+ self.log = logging.getLogger(__name__)
self.detector = EncodingDetector(
markup, override_encodings, is_html, exclude_encodings)
@@ -376,9 +377,10 @@ def __init__(self, markup, override_encodings=[],
if encoding != "ascii":
u = self._convert_from(encoding, "replace")
if u is not None:
- logging.warning(
+ self.log.warning(
"Some characters could not be decoded, and were "
- "replaced with REPLACEMENT CHARACTER.")
+ "replaced with REPLACEMENT CHARACTER."
+ )
self.contains_replacement_characters = True
break
@@ -734,7 +736,7 @@ def _codec(self, charset):
0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à
- 0xe1 : b'\xa1', # á
+ 0xe1 : b'\xa1', # á
0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä
diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py
index c04d23c3..8768332f 100644
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@@ -1,5 +1,7 @@
"""Diagnostic functions, mainly for use when doing tech support."""
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
__license__ = "MIT"
import cProfile
@@ -56,7 +58,8 @@ def diagnose(data):
data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
- data = open(data).read()
+ with open(data) as fp:
+ data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
diff --git a/lib/bs4/element.py b/lib/bs4/element.py
index ecf2b280..b100d18b 100644
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -1,8 +1,10 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
__license__ = "MIT"
-from pdb import set_trace
import collections
import re
+import shlex
import sys
import warnings
from bs4.dammit import EntitySubstitution
@@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
preformatted_tags = set(["pre"])
+ preserve_whitespace_tags = set(['pre', 'textarea'])
+
@classmethod
def _substitute_if_appropriate(cls, ns, f):
if (isinstance(ns, NavigableString)
@@ -169,11 +173,19 @@ def _is_xml(self):
This is used when mapping a formatter name ("minimal") to an
appropriate function (one that performs entity-substitution on
- the contents of
{% endblock -%}
\ No newline at end of file
diff --git a/templates/setting.html b/templates/setting.html
index 225546eb..82b692e8 100644
--- a/templates/setting.html
+++ b/templates/setting.html
@@ -55,6 +55,8 @@
+
+
@@ -73,6 +75,11 @@
+ : |
+ |