Commit
add ckxx, modify kftoutiao
hjianhao committed Jan 30, 2021
1 parent 638a1f1 commit 094dddf
Showing 3 changed files with 168 additions and 56 deletions.
89 changes: 89 additions & 0 deletions books/ckxx.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from base import BaseFeedBook            # Base class BaseFeedBook
from lib.urlopener import URLOpener      # Module for requesting a URL and fetching page content
from bs4 import BeautifulSoup            # BeautifulSoup for HTML parsing
import re
from bs4 import element

# Return the class name defined in this script
def getBook():
    return CKXX

# Subclass of BaseFeedBook
class CKXX(BaseFeedBook):
    # Metadata of the generated ebook
    title = u'参考消息要闻'             # Title
    description = u'参考消息头版要闻'   # Description
    language = 'zh-cn'                  # Language

    # Topic pages whose article lists will be extracted.
    # Each topic is a tuple of (topic name, topic page URL).
    feeds = [
        (u'参考消息要闻', 'http://www.cankaoxiaoxi.com/'),
    ]

    feed_encoding = "utf-8"
    page_encoding = 'utf-8'             # Encoding of the pages to be fetched
    fulltext_by_readability = False     # Parse pages manually instead of using readability

    coverfile = 'cv_ckxx.jpg'           # Cover image

    # Tags to keep on content pages
    keep_only_tags = [
        dict(class_='articleHead'),
        dict(name='div', class_='articleContent'),
    ]

    # Extract the URLs of all articles on each topic page
    def ParseFeedUrls(self):
        urls = []  # List holding the article tuples
        # Loop over the topic pages in feeds
        for feed in self.feeds:
            # Get the topic name and link from the tuple
            topic, url = feed[0], feed[1]
            # Request the topic page and fetch its content
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            # If the request succeeded and the page content is not empty
            if result.status_code == 200 and result.content:
                # Convert the page content into a BeautifulSoup object
                soup = BeautifulSoup(result.content, 'html.parser')
                # Find all article sections on the current page
                sections = soup.find_all(name='div', class_='column-news')
                # self.log.warn('find %d sections' % len(sections))
                for section in sections:
                    tag = section.find(name='ul', class_='column-title')
                    sectionName = tag.a.li.string
                    tuwens = section.find_all(name='div', class_=re.compile("tuwen-block-"))
                    # self.log.warn('%s find %d tuwen' % (sectionName, len(tuwens)))
                    for tuwen in tuwens:
                        articles = tuwen.find_all('a')
                        title = ''
                        link = ''
                        for article in articles:
                            if not article.img:
                                title = article.string
                                link = article.get('href')  # Article link
                                self.log.warn('title : %s, link: %s' % (title, link))
                                break
                        urls.append((sectionName, title, link, None))  # Append the article tuple to the list
                    texts = section.find_all(name='li', class_=re.compile("list-text-"))
                    # self.log.warn('%s find %d texts' % (sectionName, len(texts)))
                    for text in texts:
                        title = text.a.string
                        link = text.a.get('href')  # Article link
                        self.log.warn('title : %s, link: %s' % (title, link))
                        urls.append((sectionName, title, link, None))  # Append the article tuple to the list

            # If the request failed, report it in the log
            else:
                self.log.warn('Fetch article failed(%s):%s' %
                    (URLOpener.CodeMap(result.status_code), url))
        # Return the list of all extracted articles
        return urls

    # Strip the site suffix appended to article titles
    def processtitle(self, title):
        return title.replace(u'_《参考消息》官方网站', '')
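
ParseFeedUrls is the hook KindleEar calls to collect articles: it must return a list of (section, title, url, brief) tuples. A minimal sketch of how this book's entry points fit together, assuming it runs inside the KindleEar environment where base and lib.urlopener are importable; the driver loop below is illustrative and not code from the repository:

# Illustrative only: exercises getBook()/ParseFeedUrls() by hand.
book_cls = getBook()        # -> the CKXX class defined above
book = book_cls()           # normally instantiated by the KindleEar pipeline

for section, title, url, brief in book.ParseFeedUrls():
    # section: column name scraped from cankaoxiaoxi.com
    # title/url: article headline and absolute link
    # brief: optional pre-fetched summary, always None in this book
    print('[%s] %s -> %s' % (section, title, url))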
135 changes: 79 additions & 56 deletions books/kftoutiao.py
@@ -5,6 +5,9 @@
from lib.urlopener import URLOpener # Module for requesting a URL and fetching page content
from bs4 import BeautifulSoup # BeautifulSoup for HTML parsing
from bs4 import element
from config import SHARE_FUCK_GFW_SRV
import urllib
import string

# Return the class name defined in this script
def getBook():
@@ -31,11 +34,24 @@ class KFTouTiao(BaseFeedBook):

    coverfile = 'cv_kftoutiao.jpg' # Cover image

    # Tags to keep on content pages
    # keep_only_tags = [
    #     dict(name='rich_media_title', class_='js_content'),
    #     dict(name='rich_media_conetent', id='js_content'),
    # ]
    http_headers = { 'Accept': '*/*', 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' }

    def url4forwarder(self, url):
        ' Build a URL that goes through the forwarding service '
        return SHARE_FUCK_GFW_SRV % urllib.quote(url)

    def getRealUrl(self, url, try_count=1):
        if try_count > 3:
            return url
        try:
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url, None, self.http_headers)
            if result.status_code > 400:
                return self.getRealUrl(url, try_count + 1)
            else:
                return opener.realurl
        except:
            return self.getRealUrl(url, try_count + 1)

    # Extract the URLs of all articles on each topic page
    def ParseFeedUrls(self):
@@ -61,7 +77,11 @@ def ParseFeedUrls(self):
                    title = item.a.string # Article title
                    link = item.a.get('href') # Article link
                    link = BaseFeedBook.urljoin("https://toutiao.io", link) # Build the absolute article link
                    # self.log.warn('Fetch article : %s' % link)
                    link = self.getRealUrl(link)
                    self.log.warn('Fetch article : %s' % link)
                    if string.find(link, 'zhihu.com') != -1:
                        link = self.url4forwarder(link)
                        self.log.warn('transport : %s' % link)
                    urls.append((topic, title, link, None)) # Append the article tuple to the list
                    count = count + 1
                    if count >= 30 :
@@ -78,57 +98,60 @@ def preprocess(self, content):
        # Convert the page content into a BeautifulSoup object
        soup = BeautifulSoup(content, 'html.parser')

        siteNameTag = soup.find(attrs={"property":"og:site_name"})
        if siteNameTag :
            siteName = siteNameTag['content']
        self.keep_only_tags = []
        tag = soup.find(attrs={"property":"og:site_name"})
        if tag :
            siteName = tag['content']
            # Clean up WeChat official-account articles
            if siteName and siteName == u'微信公众平台' :
                #self.log.warn("it's WeChat article.")
                # The title field must be filled in, otherwise WeChat articles end up without a title
                soup.title.string = soup.find(attrs={"property":"og:title"})['content']
                # Remove the trailing "people who like this content also like" section
                tag = soup.find(name="div", class_="rich_media_area_extra")
                if tag :
                    tag.decompose()

                # Remove the WeChat QR code
                tag = soup.find(name="div", class_="qr_code_pc_outer")
                if tag :
                    tag.decompose()

                # Remove the article tag list
                tag = soup.find(name="div", class_="article-tag_list")
                if tag :
                    tag.decompose()

                # Remove the article metadata
                tag = soup.find(name="div", class_="rich_media_meta_list")
                if tag :
                    tag.decompose()

                # Remove the tipping (reward) section
                tag = soup.find(name="div", id="js_reward_area")
                if tag :
                    tag.decompose()

                # Remove the toolbar
                tag = soup.find(name="div", class_="rich_media_tool")
                if tag :
                    tag.decompose()

                # Remove hidden elements
                tags = soup.find_all(name="div", style="display:none;")
                for tag in tags:
                    tag.decompose()

                tags = soup.find_all(name="div", style="display: none;")
                for tag in tags:
                    tag.decompose()

        # Return the preprocessed content
        return unicode(soup)

        if siteName:
            if siteName == u'微信公众平台' :
                #self.log.warn("it's WeChat article.")
                # The title field must be filled in, otherwise WeChat articles end up without a title
                soup.title.string = soup.find(attrs={"property":"og:title"})['content']

                self.keep_only_tags = [
                    dict(name='div', id="img-content", class_='rich_media_wrp'),
                    dict(name='div', id="js_content", class_='rich_media_content'),
                ]

                # Remove hidden elements
                tags = soup.find_all(name="div", style="display:none;")
                for tag in tags:
                    tag.decompose()

                tags = soup.find_all(name="div", style="display: none;")
                for tag in tags:
                    tag.decompose()

                return unicode(soup)

            # Handle ThoughtWorks 洞见 (Insights) articles
            elif siteName == u'ThoughtWorks洞见':
                self.keep_only_tags = [
                    dict(name='div', class_='entry-wrap'),
                ]
                return content

        # Handle codingstyle articles
        tag = soup.find(name='link', rel='alternate')
        if tag :
            herfLink = tag['href']
            if herfLink and string.find(herfLink, 'codingstyle') != -1:
                self.keep_only_tags = [
                    dict(name='div', class_='topic-detail'),
                ]
                return content

        # Handle 开发者头条 (Toutiao.io) articles
        title = soup.title.string
        if title and string.find(title, u'开发者头条') != -1 :
            self.keep_only_tags = [
                dict(name='div', class_='content'),
                dict(name='div', class_='preview'),
            ]
            return content

        return content
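
The zhihu.com branch in ParseFeedUrls relies on SHARE_FUCK_GFW_SRV from config.py being a format string with a single %s placeholder into which the quoted article URL is substituted. A rough sketch of what url4forwarder produces under that assumption; the forwarder address below is made up for illustration:

# -*- coding:utf-8 -*-
import urllib

# Assumed shape of the config value; the real address lives in config.py.
SHARE_FUCK_GFW_SRV = 'https://example-forwarder.appspot.com/url2book?u=%s'

def url4forwarder(url):
    ' Build a URL that fetches the article through the forwarding service '
    return SHARE_FUCK_GFW_SRV % urllib.quote(url)

print(url4forwarder('https://zhuanlan.zhihu.com/p/12345'))
# -> https://example-forwarder.appspot.com/url2book?u=https%3A//zhuanlan.zhihu.com/p/12345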



Binary file added images/cv_ckxx.jpg
