From bcacd74c7daaf49e755f8f3e564f8c066c2df072 Mon Sep 17 00:00:00 2001
From: Doiiars
Date: Wed, 17 Jan 2024 14:55:57 +0800
Subject: [PATCH 1/2] Custom options, concurrency, optimizations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New features:
- Fuzzy ISBN lookup by title
- Configurable concurrency
- Configurable result limit
- New special identifier: nlchash

Improvements:
- Improved classification number handling

Extras:
- Added sponsorship image

---
 README.md   |  13 +++-
 __init__.py | 208 +++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 185 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 1cc8665..3199628 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,9 @@
 - **Automatic metadata retrieval**: fetches book metadata from the National Library of China by ISBN.
 - **CLC number support**: currently the only Calibre plugin that can retrieve Chinese Library Classification (CLC) numbers.
+- **Fuzzy ISBN lookup by title**: automatically retrieves ISBNs from the National Library of China by title.
+- **Configurable concurrency**: the number of concurrent requests is user-configurable.
+- **Configurable result limit**: the maximum number of results returned by a fuzzy title search is user-configurable.
 
 ## 🌟 Sample results
 ![image](https://github.com/DoiiarX/NLCISBNPlugin/assets/25550075/e6906459-0457-4c8c-a872-d7eda2d8beff)
 
@@ -46,9 +49,15 @@
 
 The following features are planned for future releases:
 
-- [ ] **More customization options**: add user-configurable options for greater flexibility.
 - [ ] **Better title handling**: improved processing of book titles.
-- [ ] **Fuzzy search**: fuzzy search by title; find multiple books that share the same ISBN.
+- [ ] **Better concurrency tuning**: reduce the chance of IP bans while improving fetch efficiency.
+- [ ] **Fuzzy search**: find multiple books that share the same ISBN.
+
+## ❤ Donation
+If you find this project helpful, please consider sponsoring it so I can devote more time to its maintenance and development. Your support is greatly appreciated.
+
+**Your `star` or `donation` is what keeps me maintaining this project long-term. Heartfelt thanks to every supporter: "every time you spend money, you are casting a vote for the kind of world you want." Recommending the project to others is also a form of support; the more people use it, the more motivation there is to keep it updated.**
 
 ## 👤 Visitor access
diff --git a/__init__.py b/__init__.py
index 2b1bed4..8ec7ab7 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,8 +1,14 @@
-from calibre.ebooks.metadata.sources.base import Source
+from calibre.ebooks.metadata.sources.base import Source, Option
 from calibre.ebooks.metadata import MetaInformation
 import re
 import urllib.request
 from bs4 import BeautifulSoup
+import urllib.parse
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import time
+import hashlib
+from random import randint
 
 # Constants: URLs and request headers
 BASE_URL = "http://opac.nlc.cn/F"
@@ -10,6 +16,10 @@
 SEARCH_URL_TEMPLATE = BASE_URL + "?func=find-b&find_code=ISB&request={isbn}&local_base=NLC01" + \
     "&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=" + \
     "&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5="
+SEARCH_URL_TEMPLATE_TITLE = BASE_URL + "?func=find-b&find_code=WTP&request={title}&local_base=NLC01" + \
+    "&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=" + \
+    "&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5="
+
 HEADERS = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'Accept-Encoding': 'gzip, deflate',
@@ -22,25 +32,114 @@
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
 }
 
+MAX_WORKERS = 2
+MAX_TITLE_LIST_NUM = 6
+
+def extract_data_info(html):
+    # Matches the result counter "第 N 条记录(共 M 条)" ("record N of M total");
+    # the Chinese literals must stay as-is because they match OPAC page text.
+    pattern = r"第\s+(\d+)\s+条记录\(共\s+(\d+)\s+条\)"
+    match = re.search(pattern, html)
+    if match:
+        current_record, total_records = match.groups()
+        return int(current_record), int(total_records)
+    else:
+        return None, None
+
+def hash_utf8_string(input_string):
+    # Encode the string as UTF-8
+    encoded_string = input_string.encode('utf-8')
+
+    # Hash it with MD5
+    hasher = hashlib.md5()
+    hasher.update(encoded_string)
+
+    # Return the hash as a hex digest
+    return hasher.hexdigest()
+
 def get_dynamic_url(log):
     '''
     Fetch the dynamic session URL from the base page.
     :param log: logger.
     :return: the dynamic URL (raises ValueError when none is found).
     '''
+
+    response = urllib.request.urlopen(urllib.request.Request(BASE_URL, headers=HEADERS), timeout=10)
+    response_text = response.read().decode('utf-8')
+    dynamic_url_match = re.search(r"http://opac.nlc.cn:80/F/[^\s?]*", response_text)
+    if dynamic_url_match:
+        dynamic_url = dynamic_url_match.group(0)
+        return dynamic_url
+    else:
+        raise ValueError("Could not find the dynamic URL")
+
+def title2metadata(title, log, result_queue, clean_downloaded_metadata, max_workers=MAX_WORKERS, max_title_list_num=MAX_TITLE_LIST_NUM):
+    if not isinstance(title, str):
+        raise TypeError("title must be a string")
+
+    title = urllib.parse.quote(f"{title}")
+    # Establishes the OPAC session; the search itself uses the static template
+    dynamic_url = get_dynamic_url(log)
+    if not dynamic_url:
+        return None
+
+    search_url = SEARCH_URL_TEMPLATE_TITLE.format(title=title)
+
+    response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
+    response_text = response.read().decode('utf-8')
+
+    titlelist = parse_search_list(response_text, log)
+
+    # Random 0.03-0.3 s pause to avoid hammering the server
+    time.sleep(randint(3, 30) / 100)
+
+    if len(titlelist) > max_title_list_num:
+        titlelist = titlelist[:max_title_list_num]
+    # Fan the detail-page requests out over a bounded thread pool
+    metadatas = []
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_url = {executor.submit(url2metadata, item[1], log, result_queue, clean_downloaded_metadata): item
+                         for item in titlelist}
+        for future in as_completed(future_to_url):
+            data = future.result()
+            if data:
+                metadatas.append(data)
+    return metadatas
+
+def url2metadata(url, log, result_queue, clean_downloaded_metadata):
+    if not isinstance(url, str):
+        raise TypeError("url must be a string")
+    search_url = url
+
+    # Cumulative random back-off before the request, to lower the risk of an IP ban
+    sleep_time = 0
+    for _ in range(8):
+        sleep_time += randint(4, 120)
+        time.sleep(sleep_time / 100)
+
     try:
-        response = urllib.request.urlopen(urllib.request.Request(BASE_URL, headers=HEADERS), timeout=10)
+        response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
         response_text = response.read().decode('utf-8')
-        dynamic_url_match = re.search(r"http://opac.nlc.cn:80/F/[^\s?]*", response_text)
-        if dynamic_url_match:
-            dynamic_url = dynamic_url_match.group(0)
-            return dynamic_url
-        else:
-            raise ValueError("Could not find the dynamic URL")
+        metadata = to_metadata(get_parse_metadata(response_text, None, log), False, log)
+        clean_downloaded_metadata(metadata)
+        result_queue.put(metadata)
+        return metadata
     except Exception as e:
-        log(f"Error fetching the dynamic URL: {e}")
+        log(f"Error fetching metadata: {e}")
         return None
 
+def parse_search_list(html, log):
+    soup = BeautifulSoup(html, "html.parser")
+    titlelist = []
+    # Find every <div> element with class "itemtitle"
+    itemtitle_elements = soup.find_all('div', class_='itemtitle')
+
+    # Walk through each match
+    for itemtitle_element in itemtitle_elements:
+        # Grab the text content
+        itemtitle = itemtitle_element.get_text()
+
+        # Find the anchor tag and take its href
+        link = itemtitle_element.find('a')['href']
+        titlelist.append([itemtitle, link])
+    return titlelist
+
 def isbn2meta(isbn, log):
     '''
     Convert an ISBN to metadata.
@@ -67,13 +166,12 @@ def isbn2meta(isbn, log):
         return None
 
     search_url = SEARCH_URL_TEMPLATE.format(isbn=isbn)
-    try:
-        response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
-        response_text = response.read().decode('utf-8')
-        return parse_metadata(response_text, isbn, log)
-    except Exception as e:
-        log(f"Error fetching metadata: {e}")
-        return None
+
+    response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
+    response_text = response.read().decode('utf-8')
+    parsed = get_parse_metadata(response_text, isbn, log)
+    metadata = to_metadata(parsed, False, log)
+    return metadata
 
 def parse_isbn(html, log):
     '''
@@ -93,17 +191,19 @@
     # If a match was found, keep the ISBN; otherwise log that none was found
     if isbn_matches:
         isbn = isbn_matches.group(1)
-        isbn = '978-'+isbn
+        isbn = isbn.replace('-', '')
+        if len(isbn) == 10:
+            # Prepends the 978 prefix; note the ISBN-13 check digit is not recomputed
+            isbn = '978' + isbn
     else:
         log('No ISBN found')
         isbn = ''
 
     # Log the ISBN (or its absence) and return it
     log(f'Parsed ISBN: {isbn}')
-    return isbn.replace('-','')
+    return isbn
 
-def parse_metadata(html, isbn, log):
+def get_parse_metadata(html, isbn, log):
     '''
     Parse metadata out of the page HTML.
     :param html: page HTML.
@@ -117,7 +217,9 @@
     data = {}
     prev_td1 = ''
     prev_td2 = ''
-
+    # Seed the requested ISBN as a fallback; values parsed from the page overwrite it
+    data.update({'isbn': isbn})
+    data.update({web_isbn: isbn})
+
     try:
         table = soup.find("table", attrs={"id": "td"})
         if not table:
@@ -163,8 +265,7 @@
         'authors': data.get("著者", "").split(' & '),
         "isbn": data.get(f"{web_isbn}", f"{isbn}")
     }
-
-    return to_metadata(metadata, False, log)
+    return metadata
 
 def to_metadata(book, add_translator_to_author, log):
     '''
@@ -178,7 +279,9 @@
     authors = (book['authors'] + book['translators']) if add_translator_to_author and book.get('translators', None) else book['authors']
 
     mi = MetaInformation(book['title'], authors)
-    mi.identifiers = {PROVIDER_ID: book['isbn']}
+    # pubdate may be absent, so default to '' (concatenating None would raise TypeError)
+    mi.identifiers = {PROVIDER_ID: book.get('isbn', ''),
+                      'nlchash': hash_utf8_string(book['title'] + book.get('pubdate', ''))
+                      }
     # mi.url = book['url']
     # mi.cover = book.get('cover', None)
     mi.publisher = book['publisher']
@@ -201,22 +304,59 @@ class NLCISBNPlugin(Source):
     name = 'NLC ISBN Plugin'
     description = 'A Calibre plugin that fetches metadata from the National Library of China by ISBN.'
     supported_platforms = ['windows', 'osx', 'linux']
-    version = (1, 1, 0)
+    version = (1, 2, 0)
     author = 'Doiiars'
-    capabilities = frozenset(['tags', 'identify', 'comments', 'pubdate'])
+    capabilities = frozenset(['identify'])
+    touched_fields = frozenset(
+        ['pubdate', 'tags',
+         'comments', 'publisher', 'authors',
+         'title', 'identifier:nlchash', 'identifier:isbn']
+    )
+
+    options = (
+        # Option(name, type, default, label, desc, choices)
+        # type is one of 'number', 'string', 'bool', 'choices'
+        Option(
+            'max_workers', 'number', MAX_WORKERS,
+            _('Maximum threads'),
+            _('Maximum number of crawler threads. Setting this too high may get your IP banned.')
+        ),
+        Option(
+            'max_title_list_num', 'number', MAX_TITLE_LIST_NUM,
+            _('Maximum results'),
+            _('Maximum number of results returned when searching by title. Requesting too many may get your IP banned for overly frequent requests.')
+        )
+    )
+
+    def __init__(self, *args, **kwargs):
+        Source.__init__(self, *args, **kwargs)
 
     def get_book_url(self, identifiers):
         return None
 
-    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
+    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=60):
         isbn = identifiers.get('isbn', '')
-        if not isbn:
-            return
-
-        metadata = isbn2meta(isbn, log)
-        if metadata:
-            result_queue.put(metadata)
-
+
+        # Look up metadata by ISBN
+        if isbn:
+            log("Fetching metadata by ISBN.")
+            metadata = isbn2meta(isbn, log)
+            if metadata:
+                result_queue.put(metadata)
+        else:
+            log("No ISBN supplied.")
+
+        # Look up metadata by title; the workers push results onto result_queue
+        if title:
+            log("Fetching metadata by title.")
+            title2metadata(title, log, result_queue, self.clean_downloaded_metadata,
+                           max_title_list_num=self.prefs.get('max_title_list_num'),
+                           max_workers=self.prefs.get('max_workers'))
+        else:
+            log('No title supplied.')
+
     def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
         return

From aa38fa9a0cfb0961648cebd7d365092a550efdfd Mon Sep 17 00:00:00 2001
From: Doiiars
Date: Wed, 17 Jan 2024 14:59:54 +0800
Subject: [PATCH 2/2] Update README.md

---
 README.md | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3199628..f0f872a 100644
--- a/README.md
+++ b/README.md
@@ -54,15 +54,20 @@
 - [ ] **Fuzzy search**: find multiple books that share the same ISBN.
 
 ## ❤ Donation
-If you find this project helpful, please consider sponsoring it so I can devote more time to its maintenance and development. Your support is greatly appreciated.
+If you find this project helpful, please consider sponsoring it so I can devote more time to its maintenance and development.
+Your support is greatly appreciated.
 
-**Your `star` or `donation` is what keeps me maintaining this project long-term. Heartfelt thanks to every supporter: "every time you spend money, you are casting a vote for the kind of world you want." Recommending the project to others is also a form of support; the more people use it, the more motivation there is to keep it updated.**
+
+![Donation](https://github.com/DoiiarX/NLCISBNPlugin/assets/25550075/fe7815a3-d209-4871-938d-dca7af7f67cb)
+
+**Your `star` or `donation` is what keeps me maintaining this project long-term. Heartfelt thanks to every supporter: "every time you spend money, you are casting a vote for the kind of world you want."
+Recommending the project to others is also a form of support; the more people use it, the more motivation there is to keep it updated.**
 
 ## 👤 Visitor access

 NLCISBNPlugin
 
 ## 🔧 Installation
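
A minimal standalone sketch of the two mechanisms this series introduces, for anyone reviewing the patch: the nlchash identifier (an MD5 digest over the UTF-8 bytes of title plus pubdate, mirroring hash_utf8_string above) and the jittered, bounded thread pool used for title searches. The URLs and sample values below are hypothetical, and nothing here is part of the plugin itself.

import hashlib
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from random import randint

def nlchash(title: str, pubdate: str = '') -> str:
    # MD5 over the UTF-8 bytes of title + pubdate, as in hash_utf8_string()
    return hashlib.md5((title + pubdate).encode('utf-8')).hexdigest()

def fetch(url: str) -> str:
    # Stand-in for url2metadata(): sleep a random 0.04-1.2 s before the
    # request, as the patch does, to reduce the chance of an IP ban
    time.sleep(randint(4, 120) / 100)
    return url

urls = ['http://opac.nlc.cn/item/%d' % i for i in range(6)]  # hypothetical links
with ThreadPoolExecutor(max_workers=2) as executor:          # MAX_WORKERS = 2
    futures = [executor.submit(fetch, u) for u in urls]
    for future in as_completed(futures):
        print(future.result())

print(nlchash('红楼梦', '2008'))  # stable 32-character hex digest

Because the digest depends only on title and pubdate, two records sharing both values collapse to the same nlchash, presumably giving calibre a stable key for matching duplicates returned by the fuzzy title search.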