-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #24 from traP-jp/tag
- Loading branch information
Showing
4 changed files
with
720 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,201 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"id": "initial_id", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:54:00.706752Z", | ||
"start_time": "2024-08-18T02:54:00.680450Z" | ||
} | ||
}, | ||
"source": [ | ||
"import pke\n", | ||
"from pke.lang import stopwords\n", | ||
"from torch.ao.nn.quantized.functional import threshold\n", | ||
"pke.base.ISO_to_languege['ja_ginza'] = 'japanese'\n", | ||
"import ginza\n", | ||
"import nltk\n", | ||
"\n", | ||
"stopwords = list(ginza.STOP_WORDS)" | ||
], | ||
"outputs": [ | ||
{ | ||
"ename": "AttributeError", | ||
"evalue": "module 'pke.base' has no attribute 'ISO_to_languege'", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", | ||
"\u001B[0;31mAttributeError\u001B[0m Traceback (most recent call last)", | ||
"Cell \u001B[0;32mIn[7], line 4\u001B[0m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpke\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlang\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m stopwords\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtorch\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mao\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mnn\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mquantized\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mfunctional\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m threshold\n\u001B[0;32m----> 4\u001B[0m \u001B[43mpke\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbase\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mISO_to_languege\u001B[49m[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mja_ginza\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mjapanese\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[1;32m 5\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mginza\u001B[39;00m\n\u001B[1;32m 6\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mnltk\u001B[39;00m\n", | ||
"\u001B[0;31mAttributeError\u001B[0m: module 'pke.base' has no attribute 'ISO_to_languege'" | ||
] | ||
} | ||
], | ||
"execution_count": 7 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:53:48.132517Z", | ||
"start_time": "2024-08-18T02:53:48.086435Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": [ | ||
"nltk.corpus.stopwords.words_org = nltk.corpus.stopwords.words \n", | ||
"nltk.corpus.stopwords.words = lambda lang : stopwords if lang == 'japanese' else nltk.corpus.stopwords.words_org(lang)" | ||
], | ||
"id": "e85c1dc1ad8406d5", | ||
"outputs": [ | ||
{ | ||
"ename": "NameError", | ||
"evalue": "name 'nltk' is not defined", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", | ||
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", | ||
"Cell \u001B[0;32mIn[6], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m nltk\u001B[38;5;241m.\u001B[39mcorpus\u001B[38;5;241m.\u001B[39mstopwords\u001B[38;5;241m.\u001B[39mwords_org \u001B[38;5;241m=\u001B[39m \u001B[43mnltk\u001B[49m\u001B[38;5;241m.\u001B[39mcorpus\u001B[38;5;241m.\u001B[39mstopwords\u001B[38;5;241m.\u001B[39mwords \n\u001B[1;32m 2\u001B[0m nltk\u001B[38;5;241m.\u001B[39mcorpus\u001B[38;5;241m.\u001B[39mstopwords\u001B[38;5;241m.\u001B[39mwords \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mlambda\u001B[39;00m lang : stopwords \u001B[38;5;28;01mif\u001B[39;00m lang \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mjapanese\u001B[39m\u001B[38;5;124m'\u001B[39m \u001B[38;5;28;01melse\u001B[39;00m nltk\u001B[38;5;241m.\u001B[39mcorpus\u001B[38;5;241m.\u001B[39mstopwords\u001B[38;5;241m.\u001B[39mwords_org(lang)\n", | ||
"\u001B[0;31mNameError\u001B[0m: name 'nltk' is not defined" | ||
] | ||
} | ||
], | ||
"execution_count": 6 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"id": "3d92e1849fa3eeda", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:41:51.555118Z", | ||
"start_time": "2024-08-18T02:41:51.549402Z" | ||
} | ||
}, | ||
"source": "text = \"1885年(明治18年)9月に明治政府指定の旧国道8号[注釈 1]として開通した当時は馬車の通行が可能な規格で整備されており、馬車同士がすれ違うこともできたほどの幅員の広い車道で、新潟県側の最奥部付近には短いながらもトンネルが設けられるなど、当時としては破格な高規格の山岳道路だった。しかし開通から程なくして各所で土砂崩れや雪崩などによる路盤決壊や橋の流失が相次ぎ、前述のトンネルも崩壊埋没してしまい、ついには車両通行そのものが完全に不可能となってしまった。その後現在に至るまで車道として再開通させる具体的な計画は立てられていない[21]。群馬県側は、谷川岳ロープウェイ土合口駅前先の谷川岳登山指導センター下(谷川岳山岳資料館前) - 一ノ倉沢(車道終点)の区間で一般車両の通行が通年規制されている(冬季は歩行者を含め通行止め)[22][23]。冬季閉鎖期間外は、みなかみ町が土合口駅前 - 一ノ倉沢間で観光向け電気バスを運行している[22]。その先の一ノ倉沢 - 芝倉沢 - 清水峠 - 居坪坂(井坪坂)分岐の区間は徒歩で通行可能[24]。ただし、峠上にある掘割の部分は現在では登山道として活用されていない。新潟県側の居坪坂[注釈 12]によりバイパスされた区間は、法令上はれっきとした現役の国道であり続けていながら、すでに自然に還っている[24]。この区間では先述のトンネルの崩壊により早い時期から歩行者すら通り抜け不能となっているため、事実上の廃道状態となって100年以上が経過しており、徒歩通行どころか立ち入ることさえも困難なほど壊滅的に荒廃している[21]。代替ルートとして、登山者は清水峠から居坪坂もしくは明治期以前の古道である十五里尾根(謙信尾根)の登山道を経由して清水へ向かうことになる[24]。\"", | ||
"outputs": [], | ||
"execution_count": 3 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"id": "11643b4055c19a5e", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:41:51.600600Z", | ||
"start_time": "2024-08-18T02:41:51.594726Z" | ||
} | ||
}, | ||
"source": [ | ||
"extractor = pke.unsupervised.MultipartiteRank()" | ||
], | ||
"outputs": [], | ||
"execution_count": 4 | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"id": "cac24a5e1fa71733", | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:44:19.960952Z", | ||
"start_time": "2024-08-18T02:44:05.306619Z" | ||
} | ||
}, | ||
"source": [ | ||
"extractor.load_document(input=text, language='ja', normalization=None)\n", | ||
"extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ', 'NUM'})\n", | ||
"extractor.candidate_weighting(threshold=0.74, method='average', alpha=1.1)" | ||
], | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"WARNING:root:No stoplist available in pke for 'ja' language.\n", | ||
"/home/utfo/.local/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", | ||
" warnings.warn(\n", | ||
"error We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like megagonlabs/transformers-ud-japanese-electra-base-ginza-510 is not the path to a directory containing a file named config.json.\n", | ||
"Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.\n", | ||
"error Can't load the configuration of '/tmp/tmpzmvhaykz/config.json'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/tmp/tmpzmvhaykz/config.json' is the correct path to a directory containing a config.json file\n", | ||
"trying to download model from huggingface hub: megagonlabs/transformers-ud-japanese-electra-base-ginza-510 ...\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"config.json: 0%| | 0.00/815 [00:00<?, ?B/s]" | ||
], | ||
"application/vnd.jupyter.widget-view+json": { | ||
"version_major": 2, | ||
"version_minor": 0, | ||
"model_id": "86f193ccd6804ae59c50baa250f7b1e0" | ||
} | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/home/utfo/.local/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", | ||
" warnings.warn(\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"pytorch_model.bin: 0%| | 0.00/434M [00:00<?, ?B/s]" | ||
], | ||
"application/vnd.jupyter.widget-view+json": { | ||
"version_major": 2, | ||
"version_minor": 0, | ||
"model_id": "50397746e4bf4f8dba72bdf301be1f12" | ||
} | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/home/utfo/.local/lib/python3.10/site-packages/transformers/modeling_utils.py:399: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", | ||
" return torch.load(checkpoint_file, map_location=\"cpu\")\n", | ||
"succeded\n", | ||
"/home/utfo/.local/lib/python3.10/site-packages/thinc/shims/pytorch.py:114: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", | ||
" with torch.cuda.amp.autocast(self._mixed_precision):\n" | ||
] | ||
} | ||
], | ||
"execution_count": 5 | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "code", | ||
"outputs": [], | ||
"execution_count": null, | ||
"source": "", | ||
"id": "3c5534e680e8402" | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"id": "initial_id", | ||
"metadata": { | ||
"collapsed": true, | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:55:38.992709Z", | ||
"start_time": "2024-08-18T02:55:33.690737Z" | ||
} | ||
}, | ||
"source": [ | ||
"import pke\n", | ||
"from nltk import extract" | ||
], | ||
"outputs": [], | ||
"execution_count": 1 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:55:55.800405Z", | ||
"start_time": "2024-08-18T02:55:55.795722Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": "extractor = pke.unsupervised.TopicRank()", | ||
"id": "44a16eace18b6f62", | ||
"outputs": [], | ||
"execution_count": 2 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:56:56.989216Z", | ||
"start_time": "2024-08-18T02:56:56.983912Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": "test = \"From the end of the 19th century until the revolutionary events of the year 1917, the land on which the church is situated today belonged to the town residence of the Feodorovsky Gorodetsky monastery of the Nizhny Novgorod diocese.For several decades in the late 19th – early 20th century, thanks to the efforts of the Abbot Archimandrite Alexy (Yakovlev), the town residence gradually increased its presence in Saint-Petersburg. First, an icon case with Feodorovskaya Icon of the Mother of God and an icon of St. Alexander Nevsky was installed in the 3rd class passenger hall of the Nikolayevsky railway station.Then permission was given to erect a stone chapel near the gate of the goods depot. The chapel project was submitted for approval of Emperor Alexander III, and he suggested to build a church instead of the chapel. Thus in 1904, a small church of the Feodorovskaya Icon of the Mother of God and of St. Alexius, Metropolitan of Moscow, was consecrated in memory of the birth of Tsarevich Alexei. It stood approximately in the place where the brick-red 'Kremlin' wall is located today.However, Abbot Alexy was not content and in 1906 he petitioned to expand the territory of the town residence of Gorodetsky monastery for the construction of a new spacious church 'because of the tightness of the Aleksievskaya Church'. By 1909, the concept was 'to construct a stately monument to the successful 300-year reign of the Romanov Dynasty', as stated in the petition addressed to the St. Petersburg Metropolitan bishop. The Construction Committee was headed by major-general of His Majesty's council Dmitri Yakovlevich Dashkov, and worked under the patronage of Grand Duke Michail Alexandrovich, the Emperor's brother.\"", | ||
"id": "5078f559f316e3d", | ||
"outputs": [], | ||
"execution_count": 3 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:57:38.789685Z", | ||
"start_time": "2024-08-18T02:57:38.016253Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": "extractor.load_document(input=test, language='en')", | ||
"id": "cb21a3f229cd44cf", | ||
"outputs": [], | ||
"execution_count": 4 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:58:29.618748Z", | ||
"start_time": "2024-08-18T02:58:29.613324Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": "extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ', 'NUM'})", | ||
"id": "640a5727580dfaff", | ||
"outputs": [], | ||
"execution_count": 5 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:58:45.922268Z", | ||
"start_time": "2024-08-18T02:58:45.645804Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": "extractor.candidate_weighting()", | ||
"id": "9c464fa27bdde79b", | ||
"outputs": [], | ||
"execution_count": 6 | ||
}, | ||
{ | ||
"metadata": { | ||
"ExecuteTime": { | ||
"end_time": "2024-08-18T02:59:01.172338Z", | ||
"start_time": "2024-08-18T02:59:01.168039Z" | ||
} | ||
}, | ||
"cell_type": "code", | ||
"source": "print(extractor.get_n_best(n=10))", | ||
"id": "3e734bd97157fa0c", | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[('church', 0.0564124514149561), ('icon case', 0.044178464727636035), ('town residence', 0.037832545326898864), ('stone chapel', 0.03494138671719946), ('abbot archimandrite alexy', 0.029273150558366604), ('mother', 0.028585922906481116), ('god', 0.027889292366314496), ('feodorovsky gorodetsky monastery', 0.026892835240459486), ('today', 0.025978609147239463), ('emperor alexander iii', 0.02499490439475357)]\n" | ||
] | ||
} | ||
], | ||
"execution_count": 7 | ||
}, | ||
{ | ||
"metadata": {}, | ||
"cell_type": "code", | ||
"outputs": [], | ||
"execution_count": null, | ||
"source": "", | ||
"id": "f1b6e5d0db83d95" | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.