Commit
add ckxx, modify kftoutiao
hjianhao committed Jan 30, 2021
1 parent 638a1f1 commit 094dddf
Showing 3 changed files with 168 additions and 56 deletions.
89 changes: 89 additions & 0 deletions books/ckxx.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from base import BaseFeedBook            # Base class BaseFeedBook
from lib.urlopener import URLOpener      # Module for requesting a URL and fetching page content
from bs4 import BeautifulSoup            # BeautifulSoup for HTML parsing
import re
from bs4 import element

# Return the class name defined in this script
def getBook():
    return CKXX

# Subclass of BaseFeedBook
class CKXX(BaseFeedBook):
    # Metadata of the generated ebook
    title = u'参考消息要闻'             # Title
    description = u'参考消息头版要闻'   # Description
    language = 'zh-cn'                  # Language

    # Topic pages whose article lists will be extracted.
    # Each topic is a tuple of (topic name, topic page URL).
    feeds = [
        (u'参考消息要闻', 'http://www.cankaoxiaoxi.com/'),
    ]

    feed_encoding = "utf-8"
    page_encoding = 'utf-8'             # Encoding of the pages to be fetched
    fulltext_by_readability = False     # Parse pages manually instead of using readability

    coverfile = 'cv_ckxx.jpg'           # Cover image

    # Tags to keep on content pages
    keep_only_tags = [
        dict(class_='articleHead'),
        dict(name='div', class_='articleContent'),
    ]

    # Extract the URLs of all articles on each topic page
    def ParseFeedUrls(self):
        urls = []  # List holding the article tuples
        # Loop over the topic pages in feeds
        for feed in self.feeds:
            # Get the topic name and link from the tuple
            topic, url = feed[0], feed[1]
            # Request the topic page and fetch its content
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            # If the request succeeded and the page content is not empty
            if result.status_code == 200 and result.content:
                # Convert the page content into a BeautifulSoup object
                soup = BeautifulSoup(result.content, 'html.parser')
                # Find all article sections on the current page
                sections = soup.find_all(name='div', class_='column-news')
                # self.log.warn('find %d sections' % len(sections))
                for section in sections:
                    tag = section.find(name='ul', class_='column-title')
                    sectionName = tag.a.li.string
                    tuwens = section.find_all(name='div', class_=re.compile("tuwen-block-"))
                    # self.log.warn('%s find %d tuwen' % (sectionName, len(tuwens)))
                    for tuwen in tuwens:
                        articles = tuwen.find_all('a')
                        title = ''
                        link = ''
                        for article in articles:
                            if not article.img:
                                title = article.string
                                link = article.get('href')  # Article link
                                self.log.warn('title : %s, link: %s' % (title, link))
                                break
                        urls.append((sectionName, title, link, None))  # Append the article tuple to the list
                    texts = section.find_all(name='li', class_=re.compile("list-text-"))
                    # self.log.warn('%s find %d texts' % (sectionName, len(texts)))
                    for text in texts:
                        title = text.a.string
                        link = text.a.get('href')  # Article link
                        self.log.warn('title : %s, link: %s' % (title, link))
                        urls.append((sectionName, title, link, None))  # Append the article tuple to the list

            # If the request failed, report it in the log
            else:
                self.log.warn('Fetch article failed(%s):%s' %
                    (URLOpener.CodeMap(result.status_code), url))
        # Return the list of all extracted articles
        return urls

    # Strip the site suffix appended to article titles
    def processtitle(self, title):
        return title.replace(u'_《参考消息》官方网站', '')
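
ParseFeedUrls is the hook KindleEar calls to collect articles: it must return a list of (section, title, url, brief) tuples. A minimal sketch of how this book's entry points fit together, assuming it runs inside the KindleEar environment where base and lib.urlopener are importable; the driver loop below is illustrative and not code from the repository:

# Illustrative only: exercises getBook()/ParseFeedUrls() by hand.
book_cls = getBook()        # -> the CKXX class defined above
book = book_cls()           # normally instantiated by the KindleEar pipeline

for section, title, url, brief in book.ParseFeedUrls():
    # section: column name scraped from cankaoxiaoxi.com
    # title/url: article headline and absolute link
    # brief: optional pre-fetched summary, always None in this book
    print('[%s] %s -> %s' % (section, title, url))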
135 changes: 79 additions & 56 deletions books/kftoutiao.py
@@ -5,6 +5,9 @@
from lib.urlopener import URLOpener # Module for requesting a URL and fetching page content
from bs4 import BeautifulSoup # BeautifulSoup for HTML parsing
from bs4 import element
from config import SHARE_FUCK_GFW_SRV
import urllib
import string

# Return the class name defined in this script
def getBook():
@@ -31,11 +34,24 @@ class KFTouTiao(BaseFeedBook):

    coverfile = 'cv_kftoutiao.jpg' # Cover image

    # Tags to keep on content pages
    # keep_only_tags = [
    #     dict(name='rich_media_title', class_='js_content'),
    #     dict(name='rich_media_conetent', id='js_content'),
    # ]
    http_headers = { 'Accept': '*/*', 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' }

    def url4forwarder(self, url):
        ' Build a URL that goes through the forwarding service '
        return SHARE_FUCK_GFW_SRV % urllib.quote(url)

    def getRealUrl(self, url, try_count=1):
        if try_count > 3:
            return url
        try:
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url, None, self.http_headers)
            if result.status_code > 400:
                return self.getRealUrl(url, try_count + 1)
            else:
                return opener.realurl
        except:
            return self.getRealUrl(url, try_count + 1)

    # Extract the URLs of all articles on each topic page
    def ParseFeedUrls(self):
@@ -61,7 +77,11 @@ def ParseFeedUrls(self):
                    title = item.a.string # Article title
                    link = item.a.get('href') # Article link
                    link = BaseFeedBook.urljoin("https://toutiao.io", link) # Build the absolute article link
                    # self.log.warn('Fetch article : %s' % link)
                    link = self.getRealUrl(link)
                    self.log.warn('Fetch article : %s' % link)
                    if string.find(link, 'zhihu.com') != -1:
                        link = self.url4forwarder(link)
                        self.log.warn('transport : %s' % link)
                    urls.append((topic, title, link, None)) # Append the article tuple to the list
                    count = count + 1
                    if count >= 30 :
@@ -78,57 +98,60 @@ def preprocess(self, content):
        # Convert the page content into a BeautifulSoup object
        soup = BeautifulSoup(content, 'html.parser')

        siteNameTag = soup.find(attrs={"property":"og:site_name"})
        if siteNameTag :
            siteName = siteNameTag['content']
        self.keep_only_tags = []
        tag = soup.find(attrs={"property":"og:site_name"})
        if tag :
            siteName = tag['content']
            # Clean up WeChat official-account articles
            if siteName and siteName == u'微信公众平台' :
                #self.log.warn("it's WeChat article.")
                # The title field must be filled in, otherwise WeChat articles end up without a title
                soup.title.string = soup.find(attrs={"property":"og:title"})['content']
                # Remove the trailing "people who like this content also like" section
                tag = soup.find(name="div", class_="rich_media_area_extra")
                if tag :
                    tag.decompose()

                # Remove the WeChat QR code
                tag = soup.find(name="div", class_="qr_code_pc_outer")
                if tag :
                    tag.decompose()

                # Remove the article tag list
                tag = soup.find(name="div", class_="article-tag_list")
                if tag :
                    tag.decompose()

                # Remove the article metadata
                tag = soup.find(name="div", class_="rich_media_meta_list")
                if tag :
                    tag.decompose()

                # Remove the tipping (reward) section
                tag = soup.find(name="div", id="js_reward_area")
                if tag :
                    tag.decompose()

                # Remove the toolbar
                tag = soup.find(name="div", class_="rich_media_tool")
                if tag :
                    tag.decompose()

                # Remove hidden elements
                tags = soup.find_all(name="div", style="display:none;")
                for tag in tags:
                    tag.decompose()

                tags = soup.find_all(name="div", style="display: none;")
                for tag in tags:
                    tag.decompose()

        # Return the preprocessed content
        return unicode(soup)

        if siteName:
            if siteName == u'微信公众平台' :
                #self.log.warn("it's WeChat article.")
                # The title field must be filled in, otherwise WeChat articles end up without a title
                soup.title.string = soup.find(attrs={"property":"og:title"})['content']

                self.keep_only_tags = [
                    dict(name='div', id="img-content", class_='rich_media_wrp'),
                    dict(name='div', id="js_content", class_='rich_media_content'),
                ]

                # Remove hidden elements
                tags = soup.find_all(name="div", style="display:none;")
                for tag in tags:
                    tag.decompose()

                tags = soup.find_all(name="div", style="display: none;")
                for tag in tags:
                    tag.decompose()

                return unicode(soup)

            # Handle ThoughtWorks 洞见 (Insights) articles
            elif siteName == u'ThoughtWorks洞见':
                self.keep_only_tags = [
                    dict(name='div', class_='entry-wrap'),
                ]
                return content

        # Handle codingstyle articles
        tag = soup.find(name='link', rel='alternate')
        if tag :
            herfLink = tag['href']
            if herfLink and string.find(herfLink, 'codingstyle') != -1:
                self.keep_only_tags = [
                    dict(name='div', class_='topic-detail'),
                ]
                return content

        # Handle 开发者头条 (Toutiao.io) articles
        title = soup.title.string
        if title and string.find(title, u'开发者头条') != -1 :
            self.keep_only_tags = [
                dict(name='div', class_='content'),
                dict(name='div', class_='preview'),
            ]
            return content

        return content
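
The zhihu.com branch in ParseFeedUrls relies on SHARE_FUCK_GFW_SRV from config.py being a format string with a single %s placeholder into which the quoted article URL is substituted. A rough sketch of what url4forwarder produces under that assumption; the forwarder address below is made up for illustration:

# -*- coding:utf-8 -*-
import urllib

# Assumed shape of the config value; the real address lives in config.py.
SHARE_FUCK_GFW_SRV = 'https://example-forwarder.appspot.com/url2book?u=%s'

def url4forwarder(url):
    ' Build a URL that fetches the article through the forwarding service '
    return SHARE_FUCK_GFW_SRV % urllib.quote(url)

print(url4forwarder('https://zhuanlan.zhihu.com/p/12345'))
# -> https://example-forwarder.appspot.com/url2book?u=https%3A//zhuanlan.zhihu.com/p/12345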



Binary file added images/cv_ckxx.jpg
