From bcacd74c7daaf49e755f8f3e564f8c066c2df072 Mon Sep 17 00:00:00 2001
From: Doiiars
Date: Wed, 17 Jan 2024 14:55:57 +0800
Subject: [PATCH 1/2] Custom options, concurrency, optimizations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New features:
- Fuzzy ISBN lookup by title
- Configurable concurrency
- Configurable result limit
- New special identifier: nlchash

Improvements:
- Improved classification number handling

Extras:
- Added sponsorship image

---
 README.md   |  13 +++-
 __init__.py | 208 +++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 185 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 1cc8665..3199628 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,9 @@
 - **Automatic metadata retrieval**: fetches book metadata from the National Library of China by ISBN.
 - **CLC number support**: currently the only Calibre plugin that can retrieve Chinese Library Classification (CLC) numbers.
+- **Fuzzy ISBN lookup by title**: automatically retrieves ISBNs from the National Library of China by title.
+- **Configurable concurrency**: the number of concurrent requests is user-configurable.
+- **Configurable result limit**: the maximum number of results returned by a fuzzy title search is user-configurable.
 
 ## 🌟 Sample results
 ![image](https://github.com/DoiiarX/NLCISBNPlugin/assets/25550075/e6906459-0457-4c8c-a872-d7eda2d8beff)
 
@@ -46,9 +49,15 @@
 
 The following features are planned for future releases:
 
-- [ ] **More customization options**: add user-configurable options for greater flexibility.
 - [ ] **Better title handling**: improved processing of book titles.
-- [ ] **Fuzzy search**: fuzzy search by title; find multiple books that share the same ISBN.
+- [ ] **Better concurrency tuning**: reduce the chance of IP bans while improving fetch efficiency.
+- [ ] **Fuzzy search**: find multiple books that share the same ISBN.
+
+## ❤ Donation
+If you find this project helpful, please consider sponsoring it so I can devote more time to its maintenance and development. Your support is greatly appreciated.
+
+**Your `star` or `donation` is what keeps me maintaining this project long-term. Heartfelt thanks to every supporter: "every time you spend money, you are casting a vote for the kind of world you want." Recommending the project to others is also a form of support; the more people use it, the more motivation there is to keep it updated.**
 
 ## 👤 Visitor access
diff --git a/__init__.py b/__init__.py
index 2b1bed4..8ec7ab7 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,8 +1,14 @@
-from calibre.ebooks.metadata.sources.base import Source
+from calibre.ebooks.metadata.sources.base import Source, Option
 from calibre.ebooks.metadata import MetaInformation
 import re
 import urllib.request
 from bs4 import BeautifulSoup
+import urllib.parse
+from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import time
+import hashlib
+from random import randint
 
 # Constants: URLs and request headers
 BASE_URL = "http://opac.nlc.cn/F"
@@ -10,6 +16,10 @@
 SEARCH_URL_TEMPLATE = BASE_URL + "?func=find-b&find_code=ISB&request={isbn}&local_base=NLC01" + \
     "&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=" + \
     "&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5="
+SEARCH_URL_TEMPLATE_TITLE = BASE_URL + "?func=find-b&find_code=WTP&request={title}&local_base=NLC01" + \
+    "&filter_code_1=WLN&filter_request_1=&filter_code_2=WYR&filter_request_2=" + \
+    "&filter_code_3=WYR&filter_request_3=&filter_code_4=WFM&filter_request_4=&filter_code_5=WSL&filter_request_5="
+
 HEADERS = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'Accept-Encoding': 'gzip, deflate',
@@ -22,25 +32,114 @@
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
 }
 
+MAX_WORKERS = 2
+MAX_TITLE_LIST_NUM = 6
+
+def extract_data_info(html):
+    # Matches the result counter "第 N 条记录(共 M 条)" ("record N of M total");
+    # the Chinese literals must stay as-is because they match OPAC page text.
+    pattern = r"第\s+(\d+)\s+条记录\(共\s+(\d+)\s+条\)"
+    match = re.search(pattern, html)
+    if match:
+        current_record, total_records = match.groups()
+        return int(current_record), int(total_records)
+    else:
+        return None, None
+
+def hash_utf8_string(input_string):
+    # Encode the string as UTF-8
+    encoded_string = input_string.encode('utf-8')
+
+    # Hash it with MD5
+    hasher = hashlib.md5()
+    hasher.update(encoded_string)
+
+    # Return the hash as a hex digest
+    return hasher.hexdigest()
+
 def get_dynamic_url(log):
     '''
     Fetch the dynamic session URL from the base page.
     :param log: logger.
     :return: the dynamic URL (raises ValueError when none is found).
     '''
+
+    response = urllib.request.urlopen(urllib.request.Request(BASE_URL, headers=HEADERS), timeout=10)
+    response_text = response.read().decode('utf-8')
+    dynamic_url_match = re.search(r"http://opac.nlc.cn:80/F/[^\s?]*", response_text)
+    if dynamic_url_match:
+        dynamic_url = dynamic_url_match.group(0)
+        return dynamic_url
+    else:
+        raise ValueError("Could not find the dynamic URL")
+
+def title2metadata(title, log, result_queue, clean_downloaded_metadata, max_workers=MAX_WORKERS, max_title_list_num=MAX_TITLE_LIST_NUM):
+    if not isinstance(title, str):
+        raise TypeError("title must be a string")
+
+    title = urllib.parse.quote(f"{title}")
+    # Establishes the OPAC session; the search itself uses the static template
+    dynamic_url = get_dynamic_url(log)
+    if not dynamic_url:
+        return None
+
+    search_url = SEARCH_URL_TEMPLATE_TITLE.format(title=title)
+
+    response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
+    response_text = response.read().decode('utf-8')
+
+    titlelist = parse_search_list(response_text, log)
+
+    # Random 0.03-0.3 s pause to avoid hammering the server
+    time.sleep(randint(3, 30) / 100)
+
+    if len(titlelist) > max_title_list_num:
+        titlelist = titlelist[:max_title_list_num]
+    # Fan the detail-page requests out over a bounded thread pool
+    metadatas = []
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_url = {executor.submit(url2metadata, item[1], log, result_queue, clean_downloaded_metadata): item
+                         for item in titlelist}
+        for future in as_completed(future_to_url):
+            data = future.result()
+            if data:
+                metadatas.append(data)
+    return metadatas
+
+def url2metadata(url, log, result_queue, clean_downloaded_metadata):
+    if not isinstance(url, str):
+        raise TypeError("url must be a string")
+    search_url = url
+
+    # Cumulative random back-off before the request, to lower the risk of an IP ban
+    sleep_time = 0
+    for _ in range(8):
+        sleep_time += randint(4, 120)
+        time.sleep(sleep_time / 100)
+
     try:
-        response = urllib.request.urlopen(urllib.request.Request(BASE_URL, headers=HEADERS), timeout=10)
+        response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
         response_text = response.read().decode('utf-8')
-        dynamic_url_match = re.search(r"http://opac.nlc.cn:80/F/[^\s?]*", response_text)
-        if dynamic_url_match:
-            dynamic_url = dynamic_url_match.group(0)
-            return dynamic_url
-        else:
-            raise ValueError("Could not find the dynamic URL")
+        metadata = to_metadata(get_parse_metadata(response_text, None, log), False, log)
+        clean_downloaded_metadata(metadata)
+        result_queue.put(metadata)
+        return metadata
     except Exception as e:
-        log(f"Error fetching the dynamic URL: {e}")
+        log(f"Error fetching metadata: {e}")
         return None
 
+def parse_search_list(html, log):
+    soup = BeautifulSoup(html, "html.parser")
+    titlelist = []
+    # Find every <div> element with class "itemtitle"
+    itemtitle_elements = soup.find_all('div', class_='itemtitle')
+
+    # Walk through each match
+    for itemtitle_element in itemtitle_elements:
+        # Grab the text content
+        itemtitle = itemtitle_element.get_text()
+
+        # Find the anchor tag and take its href
+        link = itemtitle_element.find('a')['href']
+        titlelist.append([itemtitle, link])
+    return titlelist
+
 def isbn2meta(isbn, log):
     '''
     Convert an ISBN to metadata.
@@ -67,13 +166,12 @@ def isbn2meta(isbn, log):
         return None
 
     search_url = SEARCH_URL_TEMPLATE.format(isbn=isbn)
-    try:
-        response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
-        response_text = response.read().decode('utf-8')
-        return parse_metadata(response_text, isbn, log)
-    except Exception as e:
-        log(f"Error fetching metadata: {e}")
-        return None
+
+    response = urllib.request.urlopen(urllib.request.Request(search_url, headers=HEADERS), timeout=10)
+    response_text = response.read().decode('utf-8')
+    parsed = get_parse_metadata(response_text, isbn, log)
+    metadata = to_metadata(parsed, False, log)
+    return metadata
 
 def parse_isbn(html, log):
     '''
@@ -93,17 +191,19 @@
     # If a match was found, keep the ISBN; otherwise log that none was found
     if isbn_matches:
         isbn = isbn_matches.group(1)
-        isbn = '978-'+isbn
+        isbn = isbn.replace('-', '')
+        if len(isbn) == 10:
+            # Prepends the 978 prefix; note the ISBN-13 check digit is not recomputed
+            isbn = '978' + isbn
     else:
         log('No ISBN found')
         isbn = ''
 
     # Log the ISBN (or its absence) and return it
     log(f'Parsed ISBN: {isbn}')
-    return isbn.replace('-','')
+    return isbn
 
-def parse_metadata(html, isbn, log):
+def get_parse_metadata(html, isbn, log):
     '''
     Parse metadata out of the page HTML.
     :param html: page HTML.
@@ -117,7 +217,9 @@
     data = {}
     prev_td1 = ''
     prev_td2 = ''
-
+    # Seed the requested ISBN as a fallback; values parsed from the page overwrite it
+    data.update({'isbn': isbn})
+    data.update({web_isbn: isbn})
+
     try:
         table = soup.find("table", attrs={"id": "td"})
         if not table:
@@ -163,8 +265,7 @@
         'authors': data.get("著者", "").split(' & '),
         "isbn": data.get(f"{web_isbn}", f"{isbn}")
     }
-
-    return to_metadata(metadata, False, log)
+    return metadata
 
 def to_metadata(book, add_translator_to_author, log):
     '''
@@ -178,7 +279,9 @@
     authors = (book['authors'] + book['translators']) if add_translator_to_author and book.get('translators', None) else book['authors']
 
     mi = MetaInformation(book['title'], authors)
-    mi.identifiers = {PROVIDER_ID: book['isbn']}
+    # pubdate may be absent, so default to '' (concatenating None would raise TypeError)
+    mi.identifiers = {PROVIDER_ID: book.get('isbn', ''),
+                      'nlchash': hash_utf8_string(book['title'] + book.get('pubdate', ''))
+                      }
     # mi.url = book['url']
     # mi.cover = book.get('cover', None)
     mi.publisher = book['publisher']
@@ -201,22 +304,59 @@ class NLCISBNPlugin(Source):
     name = 'NLC ISBN Plugin'
     description = 'A Calibre plugin that fetches metadata from the National Library of China by ISBN.'
     supported_platforms = ['windows', 'osx', 'linux']
-    version = (1, 1, 0)
+    version = (1, 2, 0)
     author = 'Doiiars'
-    capabilities = frozenset(['tags', 'identify', 'comments', 'pubdate'])
+    capabilities = frozenset(['identify'])
+    touched_fields = frozenset(
+        ['pubdate', 'tags',
+         'comments', 'publisher', 'authors',
+         'title', 'identifier:nlchash', 'identifier:isbn']
+    )
+
+    options = (
+        # Option(name, type, default, label, desc, choices)
+        # type is one of 'number', 'string', 'bool', 'choices'
+        Option(
+            'max_workers', 'number', MAX_WORKERS,
+            _('Maximum threads'),
+            _('Maximum number of crawler threads. Setting this too high may get your IP banned.')
+        ),
+        Option(
+            'max_title_list_num', 'number', MAX_TITLE_LIST_NUM,
+            _('Maximum results'),
+            _('Maximum number of results returned when searching by title. Requesting too many may get your IP banned for overly frequent requests.')
+        )
+    )
+
+    def __init__(self, *args, **kwargs):
+        Source.__init__(self, *args, **kwargs)
 
     def get_book_url(self, identifiers):
         return None
 
-    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
+    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=60):
         isbn = identifiers.get('isbn', '')
-        if not isbn:
-            return
-
-        metadata = isbn2meta(isbn, log)
-        if metadata:
-            result_queue.put(metadata)
-
+
+        # Look up metadata by ISBN
+        if isbn:
+            log("Fetching metadata by ISBN.")
+            metadata = isbn2meta(isbn, log)
+            if metadata:
+                result_queue.put(metadata)
+        else:
+            log("No ISBN supplied.")
+
+        # Look up metadata by title; the workers push results onto result_queue
+        if title:
+            log("Fetching metadata by title.")
+            title2metadata(title, log, result_queue, self.clean_downloaded_metadata,
+                           max_title_list_num=self.prefs.get('max_title_list_num'),
+                           max_workers=self.prefs.get('max_workers'))
+        else:
+            log('No title supplied.')
+
     def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
         return

From aa38fa9a0cfb0961648cebd7d365092a550efdfd Mon Sep 17 00:00:00 2001
From: Doiiars
Date: Wed, 17 Jan 2024 14:59:54 +0800
Subject: [PATCH 2/2] Update README.md

---
 README.md | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3199628..f0f872a 100644
--- a/README.md
+++ b/README.md
@@ -54,15 +54,20 @@
 - [ ] **Fuzzy search**: find multiple books that share the same ISBN.
 
 ## ❤ Donation
-If you find this project helpful, please consider sponsoring it so I can devote more time to its maintenance and development. Your support is greatly appreciated.
+If you find this project helpful, please consider sponsoring it so I can devote more time to its maintenance and development.
+Your support is greatly appreciated.
 
-**Your `star` or `donation` is what keeps me maintaining this project long-term. Heartfelt thanks to every supporter: "every time you spend money, you are casting a vote for the kind of world you want." Recommending the project to others is also a form of support; the more people use it, the more motivation there is to keep it updated.**
+
+![Donation](https://github.com/DoiiarX/NLCISBNPlugin/assets/25550075/fe7815a3-d209-4871-938d-dca7af7f67cb)
+
+**Your `star` or `donation` is what keeps me maintaining this project long-term. Heartfelt thanks to every supporter: "every time you spend money, you are casting a vote for the kind of world you want."
+Recommending the project to others is also a form of support; the more people use it, the more motivation there is to keep it updated.**
 
 ## 👤 Visitor access

 NLCISBNPlugin
 
 ## 🔧 Installation
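
A minimal standalone sketch of the two mechanisms this series introduces, for anyone reviewing the patch: the nlchash identifier (an MD5 digest over the UTF-8 bytes of title plus pubdate, mirroring hash_utf8_string above) and the jittered, bounded thread pool used for title searches. The URLs and sample values below are hypothetical, and nothing here is part of the plugin itself.

import hashlib
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from random import randint

def nlchash(title: str, pubdate: str = '') -> str:
    # MD5 over the UTF-8 bytes of title + pubdate, as in hash_utf8_string()
    return hashlib.md5((title + pubdate).encode('utf-8')).hexdigest()

def fetch(url: str) -> str:
    # Stand-in for url2metadata(): sleep a random 0.04-1.2 s before the
    # request, as the patch does, to reduce the chance of an IP ban
    time.sleep(randint(4, 120) / 100)
    return url

urls = ['http://opac.nlc.cn/item/%d' % i for i in range(6)]  # hypothetical links
with ThreadPoolExecutor(max_workers=2) as executor:          # MAX_WORKERS = 2
    futures = [executor.submit(fetch, u) for u in urls]
    for future in as_completed(futures):
        print(future.result())

print(nlchash('红楼梦', '2008'))  # stable 32-character hex digest

Because the digest depends only on title and pubdate, two records sharing both values collapse to the same nlchash, presumably giving calibre a stable key for matching duplicates returned by the fuzzy title search.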