
Commit

Merge pull request #5 from DoiiarX/dev
Custom options, concurrency, and optimisations
DoiiarX authored Jan 17, 2024
2 parents a32a83c + aa38fa9 commit da51c61
Showing 2 changed files with 191 additions and 37 deletions.
20 changes: 17 additions & 3 deletions README.md
@@ -29,6 +29,9 @@

- **Automatic metadata retrieval**: fetches book metadata from the National Library of China automatically via ISBN.
- **Chinese Library Classification support**: currently the only calibre plugin that can retrieve Chinese Library Classification (CLC) numbers.
- **Fuzzy ISBN search by title**: automatically looks up ISBNs at the National Library of China from a book title.
- **Configurable concurrency**: the number of concurrent requests is user-configurable.
- **Configurable result limit**: the maximum number of results returned by a fuzzy search is user-configurable.

## 🌟Sample results
![image](https://github.com/DoiiarX/NLCISBNPlugin/assets/25550075/e6906459-0457-4c8c-a872-d7eda2d8beff)
@@ -46,14 +49,25 @@

The following features are planned for future versions of the plugin:

- [ ] **Better title handling**: improved handling of book titles before searching.
- [ ] **Better concurrency tuning**: smarter concurrency control that lowers the risk of IP bans while improving fetch efficiency.
- [ ] **Fuzzy search**: search by ISBN for the multiple books that share the same ISBN.

## ❤ Donation
If you find this project helpful, please consider sponsoring it so that I can devote more time to maintenance and development. Your support is greatly appreciated.


![Donation](https://github.com/DoiiarX/NLCISBNPlugin/assets/25550075/fe7815a3-d209-4871-938d-dca7af7f67cb)

**Your `star` or `sponsorship` is what keeps me maintaining this project over the long term. I am sincerely grateful to every supporter: "every time you spend money, you are casting a vote for the world you want." Recommending this project to more people is another way to support it; the more people use it, the more motivation there is to keep it updated.**

## 👤Visitor count
<p align="center">
  <img align="center" src="https://profile-counter.glitch.me/NLCISBNPlugin/count.svg" alt="NLCISBNPlugin"/>
</p>

## 🔧Installation

208 changes: 174 additions & 34 deletions __init__.py
@@ -1,15 +1,25 @@
from calibre.ebooks.metadata.sources.base import Source, Option
from calibre.ebooks.metadata import MetaInformation
import re
import urllib.request
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import hashlib
from random import randint

# Constants: URLs and request headers
BASE_URL = "http://opac.nlc.cn/F"
PROVIDER_ID = "isbn"
SEARCH_URL_TEMPLATE = BASE_URL + "?func=find-b&find_code=ISB&request={isbn}&local_base=NLC01" + \
"&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=" + \
"&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5="
SEARCH_URL_TEMPLATE_TITLE = BASE_URL + "?func=find-b&find_code=WTP&request={title}&local_base=NLC01" + \
"&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=" + \
"&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5="

HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
@@ -22,25 +32,114 @@
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}

MAX_WORKERS = 2
MAX_TITLE_LIST_NUM = 6
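# Default concurrency and title-search result cap; both can be overridden through the plugin options defined further below.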

def extract_data_info(html):
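    '''
    Parse "第 X 条记录(共 Y 条)" ("record X of Y") from a result page.
    Returns (current_record, total_records), or (None, None) if the pattern is absent.
    '''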
pattern = r"第\s+(\d+)\s+条记录\(共\s+(\d+)\s+条\)"
match = re.search(pattern, html)
if match:
current_record, total_records = match.groups()
return int(current_record), int(total_records)
else:
return None, None

def hash_utf8_string(input_string):
    # Encode the string as UTF-8
encoded_string = input_string.encode('utf-8')

    # Compute the MD5 hash of the encoded bytes
hasher = hashlib.md5()
hasher.update(encoded_string)

    # Return the hash as a hexadecimal string
return hasher.hexdigest()

def get_dynamic_url(log):
    '''
    Fetch the dynamic session URL from the OPAC base page.
    :param log: logger.
    :return: the dynamic URL; raises ValueError when it cannot be found.
    '''

response = urllib.request.urlopen(urllib.request.Request(BASE_URL, headers=HEADERS), timeout=10)
response_text = response.read().decode('utf-8')
dynamic_url_match = re.search(r"http://opac.nlc.cn:80/F/[^\s?]*", response_text)
if dynamic_url_match:
dynamic_url = dynamic_url_match.group(0)
return dynamic_url
else:
raise ValueError("无法找到动态URL")

def title2metadata(title, log, result_queue, clean_downloaded_metadata, max_workers=MAX_WORKERS, max_title_list_num=MAX_TITLE_LIST_NUM ):
if not isinstance(title, str):
raise TypeError("title必须是字符串")

title = urllib.parse.quote(f"{title}")
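    # Fetch the base page first, presumably to initialise the OPAC session; the returned dynamic URL is not reused below.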
dynamic_url = get_dynamic_url(log)
if not dynamic_url:
return None

search_url = SEARCH_URL_TEMPLATE_TITLE.format(title=title)

response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)

response_text = response.read().decode('utf-8')

titlelist = parse_search_list(response_text, log)

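    # Brief randomised pause (0.03-0.30 s) after the search request and before the detail-page fetches, to avoid hammering the server.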
sleep_time = 0
sleep_time += randint(3,30)
time.sleep(sleep_time/100)

    if len(titlelist) > max_title_list_num:
        titlelist = titlelist[:max_title_list_num]
    # Fetch the candidate detail pages concurrently with a thread pool
    metadatas = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(url2metadata, item[1], log, result_queue, clean_downloaded_metadata,
                                         max_workers=max_workers, max_title_list_num=max_title_list_num): item
                         for item in titlelist}
for future in as_completed(future_to_url):
data = future.result()
if data:
metadatas.append(data)
return metadatas

def url2metadata(url, log, result_queue, clean_downloaded_metadata, max_workers=MAX_WORKERS, max_title_list_num=MAX_TITLE_LIST_NUM):
    if not isinstance(url, str):
        raise TypeError("url必须是字符串")
    search_url = url

    # Accumulate a randomised delay before fetching the detail page,
    # to spread out concurrent requests and reduce the risk of an IP ban
    sleep_time = 0
    for _ in range(8):
        sleep_time += randint(4, 120)
    time.sleep(sleep_time/100)

    try:
        response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
        response_text = response.read().decode('utf-8')
        metadata = to_metadata(get_parse_metadata(response_text, None, log), False, log)
        clean_downloaded_metadata(metadata)
        result_queue.put(metadata)
        return metadata
    except Exception:
        return None

def parse_search_list(html, log):
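    '''
    Parse the OPAC result-list page and return a list of [title, detail-page URL] pairs.
    '''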
soup = BeautifulSoup(html, "html.parser")
titlelist = []
    # Find every <div> element with class "itemtitle"
itemtitle_elements = soup.find_all('div', class_='itemtitle')

    # Iterate over each matching element
    for itemtitle_element in itemtitle_elements:
        # Get the displayed title text
        itemtitle = itemtitle_element.get_text()

        # Find the <a> tag and take its href (the link to the record's detail page)
        link = itemtitle_element.find('a')['href']
        titlelist.append([itemtitle, link])
return titlelist

def isbn2meta(isbn, log):
'''
    Convert an ISBN to metadata.
@@ -67,13 +166,12 @@ def isbn2meta(isbn, log):
return None

search_url = SEARCH_URL_TEMPLATE.format(isbn=isbn)

    response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
    response_text = response.read().decode('utf-8')
    parse_metadata = get_parse_metadata(response_text, isbn, log)
    metadata = to_metadata(parse_metadata, False, log)
    return metadata

def parse_isbn(html, log):
'''
Expand All @@ -93,17 +191,19 @@ def parse_isbn(html, log):
    # If a match is found, keep the ISBN; otherwise log that none was found
if isbn_matches:
isbn = isbn_matches.group(1)
        isbn = isbn.replace('-', '')
        # Pad a 10-character ISBN to 13 digits by prepending the 978 prefix
        if len(isbn) == 10:
            isbn = '978' + isbn
else:
log(f'未找到ISBN号')
isbn = ''

    # Log the parsed (or missing) ISBN and return it
log(f'解析得到的ISBN号: {isbn}')
    return isbn


def get_parse_metadata(html, isbn, log):
'''
    Parse metadata from the record page HTML.
    :param html: the page HTML; it is parsed internally with BeautifulSoup.
@@ -117,7 +217,9 @@ def parse_metadata(html, isbn, log):
data = {}
prev_td1 = ''
prev_td2 = ''

data.update({'isbn': isbn})
data.update({web_isbn: web_isbn})

try:
table = soup.find("table", attrs={"id": "td"})
if not table:
@@ -163,8 +265,7 @@
'authors': data.get("著者", "").split(' & '),
"isbn": data.get(f"{web_isbn}", f"{isbn}")
}

    return metadata

def to_metadata(book, add_translator_to_author, log):
'''
@@ -178,7 +279,9 @@ def to_metadata(book, add_translator_to_author, log):
authors = (book['authors'] + book['translators']
) if add_translator_to_author and book.get('translators', None) else book['authors']
mi = MetaInformation(book['title'], authors)
    mi.identifiers = {PROVIDER_ID: book.get('isbn', ''),
                      'nlchash': f"{hash_utf8_string(book['title'] + book.get('pubdate', ''))}"
                      }
# mi.url = book['url']
# mi.cover = book.get('cover', None)
mi.publisher = book['publisher']
@@ -201,22 +304,59 @@ class NLCISBNPlugin(Source):
name = '国家图书馆ISBN插件'
description = '使用ISBN从中国国家图书馆获取元数据的Calibre插件。'
supported_platforms = ['windows', 'osx', 'linux']
    version = (1, 2, 0)
author = 'Doiiars'
    capabilities = frozenset(['identify'])
touched_fields = frozenset(
['pubdate', 'tags', 'identify',
'comments', 'publisher', 'authors',
'title', 'identifier:'+'nlchash', 'identifier:'+'isbn']
)

options = (
        # Option(name, type, default, label, desc, choices)
        # type is one of 'number', 'string', 'bool', 'choices'
Option(
'max_workers', 'number', MAX_WORKERS,
_('最大线程数'),
_('爬虫最大线程数。如果过大可能导致用户IP被封锁。')
),
Option(
'max_title_list_num', 'number', MAX_TITLE_LIST_NUM,
_('最大返回量'),
_('通过标题搜索时,最多返回多少数据。请求量过多可能因为请求过于频繁被封锁IP。')
)
)
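    # Values chosen by the user are read back via self.prefs in identify() below.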

def __init__(self, *args, **kwargs):
Source.__init__(self, *args, **kwargs)

def get_book_url(self, identifiers):
return None

    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=60):
        isbn = identifiers.get('isbn', '')

        # Look up metadata by ISBN
        metadata = None
        if isbn:
            metadata = isbn2meta(isbn, log)
            log(f"根据isbn获取metadata。")
            if metadata:
                result_queue.put(metadata)
        else:
            log(f"未检测到isbn。")

        # Look up metadata by fuzzy title search
        metadata = None
        if title:
            log(f"根据书名获取metadata")
            metadatas = title2metadata(title, log, result_queue, self.clean_downloaded_metadata,
                                       max_title_list_num=self.prefs.get('max_title_list_num'),
                                       max_workers=self.prefs.get('max_workers')
                                       )
        else:
            log(f'未检测到title。')

def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
return

