diff --git a/README.md b/README.md index 92105ac..51e6a46 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,11 @@ The score here is derived from the term [tf-idf](https://en.wikipedia.org/wiki/T ### Word cloud -Here is a [wordcloud](https://github.com/datasciencecampus/patent_app_detect/output/wordclouds/wordcloud_tech.png) using the Y02 classification on a 10,000 random sample of patents. The greater the tf-idf score, the larger the font size of the term. +Here is a [wordcloud](https://raw.githubusercontent.com/datasciencecampus/patent_app_detect/master/outputs/wordclouds/wordcloud_tech.png) using the Y02 classification on a 10,000 random sample of patents. The greater the tf-idf score, the larger the font size of the term. ### Force directed graph -This output provides an [interactive graph](https://github.com/datasciencecampus/patent_app_detect/outputs/fdg/index.html) that shows connections between terms that are generally found in the same patent documents. This example was run for the Y02 classification on a 10,000 random sample of patents. +This output provides an interactive graph to be viewed in a web browser (you need to locally open the file ```outputs/fdg/index.html```). The graph shows connections between terms that are generally found in the same patent documents. The example graph in the ```outputs/fdg``` folder was created using the Y02 classification on a 10,000 random sample of patents. ## How to install @@ -82,6 +82,27 @@ python detect.py -ps=USPTO-random-10000 Will run the tool for a pre-created random dataset of 10,000 patents. 
+### Additional patent sources + +Patent datasets are stored in the sub-folder ```data```; we have supplied the following files: +- ```USPTO-random-100.pkl.bz2``` +- ```USPTO-random-1000.pkl.bz2``` +- ```USPTO-random-10000.pkl.bz2``` +- ```USPTO-random-100000.pkl.bz2``` +- ```USPTO-random-500000.pkl.bz2``` + +The command ```python detect.py -ps=USPTO-random-10000``` instructs the program to load a pickled data frame of patents +from a file located in ```data/USPTO-random-10000.pkl.bz2```. Hence ```-ps=NAME``` looks for ```data/NAME.pkl.bz2```. + +We have hosted larger datasets on Google Drive, as the files are too large for GitHub version control. We have made available: +- All USPTO patents from 2004 (477 MB): [USPTO-all.pkl.bz2](https://drive.google.com/drive/folders/1d47pizWdKqtORS1zoBzsk3tLk6VZZA4N) + +To use additional files, follow the link and download the pickle file into the data folder. Access the new data +with ```-ps=NameWithoutFileExtension```; for example, ```USPTO-all.pkl.bz2``` would be loaded with ```-ps=USPTO-all```. + +Note that large datasets will require a large amount of system memory (such as 64 GB); otherwise processing will be very slow, +as virtual memory (swap) is very likely to be used. + ### Choosing CPC classification This subsets the chosen patents dataset to a particular Cooperative Patent Classification (CPC) class, for example Y02. The Y02 classification is for "technologies or applications for mitigation or adaptation against climate change". In this case a larger patent dataset is generally required to allow for the reduction in patent numbers after subsetting. An example script is: @@ -216,3 +237,9 @@ optional arguments: the desired cpc classification ``` + +## Acknowledgements + +### Patent data + +Patent data was obtained from the [United States Patent and Trademark Office (USPTO)](https://www.uspto.gov) through the [Bulk Data Storage System (BDSS)](https://bulkdata.uspto.gov). 
In particular we used the `Patent Grant Full Text Data/APS (JAN 1976 - PRESENT)` dataset, using the data from 2004 onwards in XML 4.* format. diff --git a/detect.py b/detect.py index 38682d8..5085896 100644 --- a/detect.py +++ b/detect.py @@ -109,7 +109,7 @@ def get_tfidf(args, filename, cpc): def main(): - paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'json'), os.path.join('outputs', 'wordclouds')] + paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds')] for path in paths: os.makedirs(path, exist_ok=True) diff --git a/outputs/fdg/empty.json b/outputs/fdg/empty.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/outputs/fdg/empty.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/outputs/fdg/f.js b/outputs/fdg/f.js index a9a3927..d50d0a5 100644 --- a/outputs/fdg/f.js +++ b/outputs/fdg/f.js @@ -1,10 +1,7 @@ -var dataURL = "http://mysafeinfo.com/api/data?list=englishmonarchs&format=json"; - +var dataURL = "https://raw.githubusercontent.com/datasciencecampus/patent_app_detect/master/outputs/fdg/empty.json"; var refresh = function(data){ - - var json_obj = JSON.parse(data); var svg = d3.select("svg"), width = +svg.attr("width"), @@ -25,40 +22,20 @@ d3.select("div#chartId") //class to make it responsive .classed("svg-content-responsive", true); -//var container = d3.select('body').append('div') -// .attr('id','container') -//; -// -//// svg#sky -//var sky = container.append('svg') -// //.attr('height', 100) -// //.attr('width', 100) -// .attr('id', 'sky') -//; - var color = d3.scaleOrdinal(d3.schemeCategory20c); -//var nodeRadius = 20; var padding = 1, // separation between circles radius=6; - - var simulation = d3.forceSimulation() .force("link", d3.forceLink().id(function(d) { return d.text; }).distance(300)) .force("charge", d3.forceManyBody().strength(-100)) .force("center", d3.forceCenter(width / 2, height / 2)) - //.force("gravity", 0.05) - //.force("linkDistance", 50) - 
//.force("size", [9000, 6000]) .force("collide", d3.forceCollide().radius(function(d) { return 12*radius + padding; }).iterations(40)) - - - d3.json(dataURL, function(error, graph) { if (error) throw error; @@ -72,7 +49,6 @@ d3.json(dataURL, function(error, graph) { .data(graph.links) .enter().append("line").attr("stroke-width", function(d) { return (8*d.size); - //Math.sqrt(1.5*d.size); }); var node = svg.append("g") @@ -119,7 +95,6 @@ d3.json(dataURL, function(error, graph) { .text(function(d) { return d.text }); - simulation .nodes(graph.nodes) diff --git a/outputs/fdg/index.html b/outputs/fdg/index.html index 707e4b1..56efa59 100644 --- a/outputs/fdg/index.html +++ b/outputs/fdg/index.html @@ -4,5 +4,5 @@ - + diff --git a/outputs/fdg/key-terms.js b/outputs/fdg/key-terms.js new file mode 100644 index 0000000..a2da386 --- /dev/null +++ b/outputs/fdg/key-terms.js @@ -0,0 +1 @@ +data = '[{"nodes": [{"text": "bottom surface", "freq": 0.6726052091113843}, {"text": "circuit board", "freq": 0.7063039273398004}, {"text": "communication device", "freq": 0.8441174450907452}, {"text": "communication system", "freq": 0.6841318043798694}, {"text": "composition comprising", "freq": 0.6989498037779591}, {"text": "computer program", "freq": 0.5944663796903795}, {"text": "computer system", "freq": 0.8236626514636429}, {"text": "control signal", "freq": 0.608818068396678}, {"text": "control system", "freq": 0.7092940998581272}, {"text": "control unit", "freq": 0.7006086451058384}, {"text": "crystal display", "freq": 0.735757252300935}, {"text": "data processing", "freq": 0.5625517519391339}, {"text": "device comprises", "freq": 0.6798825701962133}, {"text": "display device", "freq": 0.6699163987387577}, {"text": "electronic device", "freq": 1.0836145828560166}, {"text": "first layer", "freq": 0.6696081758173493}, {"text": "first signal", "freq": 0.6153414407663305}, {"text": "first surface", "freq": 0.4804165912304299}, {"text": "gate electrode", "freq": 0.6856552419742178}, 
{"text": "host computer", "freq": 0.7524889882464381}, {"text": "image data", "freq": 0.6588083301554957}, {"text": "inner surface", "freq": 0.5339913211938395}, {"text": "insulating layer", "freq": 0.5796244292088581}, {"text": "integrated circuit", "freq": 0.7369670722710928}, {"text": "internal combustion", "freq": 0.5505976390221616}, {"text": "internal combustion engine", "freq": 0.5505976390221616}, {"text": "layer formed", "freq": 0.593164590717632}, {"text": "least partially", "freq": 0.6625063974987224}, {"text": "light emitting", "freq": 0.8299031005427082}, {"text": "light source", "freq": 1.124916561762969}, {"text": "liquid crystal", "freq": 0.9408618507563702}, {"text": "liquid crystal display", "freq": 0.7180726385545779}, {"text": "longitudinal axis", "freq": 0.701642649124472}, {"text": "memory cell", "freq": 0.813200268633091}, {"text": "memory device", "freq": 0.6680810754385427}, {"text": "nucleic acid", "freq": 0.7956662195388315}, {"text": "pharmaceutical composition", "freq": 0.7732320560420489}, {"text": "plant produced", "freq": 0.5083491104900089}, {"text": "power supply", "freq": 1.2}, {"text": "second layer", "freq": 0.6664497306675388}, {"text": "semiconductor device", "freq": 0.80906001352676}, {"text": "semiconductor layer", "freq": 0.7911562791233471}, {"text": "semiconductor substrate", "freq": 0.9431288684635308}, {"text": "spaced apart", "freq": 0.5049907954259427}, {"text": "storage unit", "freq": 0.5560998001482339}, {"text": "thin film", "freq": 0.6150075162220825}, {"text": "tissue culture", "freq": 0.4648929920260698}, {"text": "top surface", "freq": 0.7128266744449199}, {"text": "unit configured", "freq": 0.7727955980163723}, {"text": "user interface", "freq": 0.6980835935865308}], "links": [{"source": "semiconductor substrate", "target": "semiconductor device", "size": 0.4}, {"source": "gate electrode", "target": "semiconductor substrate", "size": 0.2}, {"source": "internal combustion engine", "target": "internal 
combustion", "size": 1.0}, {"source": "first surface", "target": "first signal", "size": 0.2}, {"source": "second layer", "target": "first layer", "size": 0.2}, {"source": "composition comprising", "target": "pharmaceutical composition", "size": 0.6000000000000001}, {"source": "light source", "target": "light emitting", "size": 0.2}, {"source": "integrated circuit", "target": "first layer", "size": 0.2}, {"source": "first layer", "target": "second layer", "size": 0.6000000000000001}, {"source": "host computer", "target": "computer system", "size": 0.2}, {"source": "semiconductor device", "target": "device comprises", "size": 0.2}, {"source": "device comprises", "target": "integrated circuit", "size": 0.2}, {"source": "insulating layer", "target": "gate electrode", "size": 0.2}, {"source": "thin film", "target": "display device", "size": 0.2}, {"source": "light source", "target": "host computer", "size": 0.2}, {"source": "spaced apart", "target": "liquid crystal", "size": 0.2}, {"source": "longitudinal axis", "target": "spaced apart", "size": 0.2}, {"source": "liquid crystal display", "target": "crystal display", "size": 1.2}, {"source": "crystal display", "target": "liquid crystal", "size": 1.2}, {"source": "liquid crystal", "target": "layer formed", "size": 0.2}, {"source": "liquid crystal", "target": "power supply", "size": 0.2}, {"source": "semiconductor layer", "target": "first layer", "size": 0.2}, {"source": "memory cell", "target": "memory device", "size": 0.2}, {"source": "semiconductor substrate", "target": "gate electrode", "size": 0.2}, {"source": "gate electrode", "target": "semiconductor device", "size": 0.2}, {"source": "tissue culture", "target": "plant produced", "size": 1.0}, {"source": "electronic device", "target": "device comprises", "size": 0.2}, {"source": "memory device", "target": "integrated circuit", "size": 0.2}, {"source": "liquid crystal", "target": "display device", "size": 0.2}, {"source": "bottom surface", "target": "top surface", 
"size": 0.8}, {"source": "top surface", "target": "spaced apart", "size": 0.2}, {"source": "first surface", "target": "least partially", "size": 0.2}, {"source": "thin film", "target": "liquid crystal", "size": 0.2}, {"source": "device comprises", "target": "semiconductor device", "size": 0.2}, {"source": "display device", "target": "light emitting", "size": 0.2}, {"source": "power supply", "target": "light emitting", "size": 0.2}, {"source": "top surface", "target": "inner surface", "size": 0.2}, {"source": "plant produced", "target": "tissue culture", "size": 0.2}, {"source": "storage unit", "target": "control unit", "size": 0.2}, {"source": "control unit", "target": "communication device", "size": 0.2}, {"source": "semiconductor layer", "target": "layer formed", "size": 0.2}, {"source": "layer formed", "target": "memory device", "size": 0.2}, {"source": "gate electrode", "target": "insulating layer", "size": 0.2}, {"source": "insulating layer", "target": "semiconductor layer", "size": 0.2}, {"source": "semiconductor layer", "target": "semiconductor substrate", "size": 0.2}, {"source": "semiconductor substrate", "target": "semiconductor layer", "size": 0.2}, {"source": "liquid crystal", "target": "light source", "size": 0.2}, {"source": "layer formed", "target": "least partially", "size": 0.2}, {"source": "insulating layer", "target": "thin film", "size": 0.2}, {"source": "thin film", "target": "circuit board", "size": 0.2}, {"source": "light source", "target": "communication device", "size": 0.2}, {"source": "communication device", "target": "light emitting", "size": 0.2}, {"source": "circuit board", "target": "layer formed", "size": 0.2}, {"source": "semiconductor layer", "target": "thin film", "size": 0.2}, {"source": "thin film", "target": "first layer", "size": 0.2}, {"source": "memory device", "target": "display device", "size": 0.2}, {"source": "computer system", "target": "user interface", "size": 0.2}, {"source": "computer system", "target": "computer 
program", "size": 0.2}, {"source": "power supply", "target": "electronic device", "size": 0.2}, {"source": "unit configured", "target": "image data", "size": 0.2}, {"source": "image data", "target": "control unit", "size": 0.2}, {"source": "host computer", "target": "display device", "size": 0.2}, {"source": "top surface", "target": "light source", "size": 0.2}, {"source": "circuit board", "target": "control unit", "size": 0.2}, {"source": "device comprises", "target": "light source", "size": 0.2}, {"source": "inner surface", "target": "spaced apart", "size": 0.2}, {"source": "liquid crystal", "target": "gate electrode", "size": 0.2}, {"source": "gate electrode", "target": "spaced apart", "size": 0.2}, {"source": "top surface", "target": "first surface", "size": 0.2}]}]' \ No newline at end of file diff --git a/outputs/fdg/key-terms.json b/outputs/fdg/key-terms.json deleted file mode 100644 index 5e9a28c..0000000 --- a/outputs/fdg/key-terms.json +++ /dev/null @@ -1 +0,0 @@ -data = '[{"nodes": [{"text": "absorber layer", "freq": 0.5577280116665608}, {"text": "air supply", "freq": 0.4218314588766082}, {"text": "catalyst system", "freq": 0.46947087379298164}, {"text": "charge air cooler", "freq": 0.3714746903313664}, {"text": "combustion chamber", "freq": 0.8404564666280396}, {"text": "combustion engine", "freq": 0.8833737327573354}, {"text": "combustion gas", "freq": 0.49637940629126637}, {"text": "control device", "freq": 0.4726868628152021}, {"text": "control signal", "freq": 0.5405014323800565}, {"text": "control unit", "freq": 0.6870767559356139}, {"text": "electrical power", "freq": 0.6194948900716555}, {"text": "electrode assembly", "freq": 0.3903145898802335}, {"text": "electronic control", "freq": 0.45785753445920574}, {"text": "electronic control unit", "freq": 0.38637998618077196}, {"text": "energy storage", "freq": 0.7984048463402005}, {"text": "energy storage unit", "freq": 0.5137820511235243}, {"text": "exhaust gas", "freq": 0.8911067864630173}, {"text": 
"first period", "freq": 0.35412161624882355}, {"text": "first voltage", "freq": 0.4677785748388333}, {"text": "fuel cell", "freq": 1.2}, {"text": "gas turbine", "freq": 0.491809721865946}, {"text": "generation system", "freq": 0.4437535747196988}, {"text": "heat exchanger", "freq": 0.9867655893110394}, {"text": "internal combustion", "freq": 0.7054883246355204}, {"text": "internal combustion engine", "freq": 0.6745635676615691}, {"text": "least partially", "freq": 0.49063232747867486}, {"text": "metal oxide", "freq": 0.38598231263553406}, {"text": "photovoltaic device", "freq": 0.5665137779743485}, {"text": "positive electrode", "freq": 0.7019768030670732}, {"text": "power consumption", "freq": 0.4257771050416817}, {"text": "power generation", "freq": 0.4633715885843366}, {"text": "power management", "freq": 0.5138158849097184}, {"text": "power save", "freq": 0.47581712135222387}, {"text": "power source", "freq": 0.49213776710203305}, {"text": "power split", "freq": 0.34059739682392864}, {"text": "power split device", "freq": 0.34059739682392864}, {"text": "power supply", "freq": 0.672389446284928}, {"text": "precursor comprising", "freq": 0.36192513420519024}, {"text": "predetermined period", "freq": 0.4141884797809875}, {"text": "second led", "freq": 0.47539748252319913}, {"text": "second period", "freq": 0.30795696918214815}, {"text": "secondary battery", "freq": 0.6230986279388568}, {"text": "solar cell", "freq": 0.5337441419844668}, {"text": "split device", "freq": 0.34059739682392864}, {"text": "storage unit", "freq": 0.583622947061899}, {"text": "thin film", "freq": 0.47747119084011735}, {"text": "transmission system", "freq": 0.315547372920685}, {"text": "unit configured", "freq": 0.5542686017729994}, {"text": "voltage supplied", "freq": 0.3534296750361906}, {"text": "window layer", "freq": 0.5130965157555655}], "links": [{"source": "power generation", "target": "generation system", "size": 0.2}, {"source": "exhaust gas", "target": "air supply", "size": 
0.2}, {"source": "catalyst system", "target": "metal oxide", "size": 0.2}, {"source": "electronic control unit", "target": "electronic control", "size": 0.7}, {"source": "electronic control", "target": "control unit", "size": 0.7}, {"source": "generation system", "target": "solar cell", "size": 0.2}, {"source": "split device", "target": "power split device", "size": 0.7}, {"source": "power split device", "target": "power split", "size": 0.7}, {"source": "unit configured", "target": "predetermined period", "size": 0.2}, {"source": "predetermined period", "target": "voltage supplied", "size": 0.2}, {"source": "voltage supplied", "target": "second period", "size": 0.2}, {"source": "second period", "target": "first period", "size": 0.2}, {"source": "first period", "target": "first voltage", "size": 0.2}, {"source": "first voltage", "target": "power source", "size": 0.2}, {"source": "absorber layer", "target": "window layer", "size": 0.2}, {"source": "window layer", "target": "solar cell", "size": 0.2}, {"source": "thin film", "target": "precursor comprising", "size": 0.2}, {"source": "precursor comprising", "target": "photovoltaic device", "size": 0.2}, {"source": "secondary battery", "target": "positive electrode", "size": 0.7}, {"source": "electrical power", "target": "transmission system", "size": 0.2}, {"source": "energy storage unit", "target": "storage unit", "size": 0.7}, {"source": "storage unit", "target": "energy storage", "size": 0.2}, {"source": "internal combustion engine", "target": "internal combustion", "size": 1.2}, {"source": "internal combustion", "target": "combustion engine", "size": 1.2}, {"source": "power save", "target": "power consumption", "size": 0.2}, {"source": "power supply", "target": "internal combustion engine", "size": 0.2}, {"source": "fuel cell", "target": "charge air cooler", "size": 0.2}, {"source": "first voltage", "target": "power management", "size": 0.2}, {"source": "control signal", "target": "first voltage", "size": 0.2}, 
{"source": "heat exchanger", "target": "combustion gas", "size": 0.2}, {"source": "secondary battery", "target": "control device", "size": 0.2}, {"source": "control device", "target": "positive electrode", "size": 0.2}, {"source": "window layer", "target": "absorber layer", "size": 0.2}, {"source": "absorber layer", "target": "photovoltaic device", "size": 0.2}, {"source": "combustion gas", "target": "fuel cell", "size": 0.2}, {"source": "combustion engine", "target": "control unit", "size": 0.2}, {"source": "gas turbine", "target": "least partially", "size": 0.2}, {"source": "storage unit", "target": "electrical power", "size": 0.2}, {"source": "electrical power", "target": "energy storage", "size": 0.2}, {"source": "combustion gas", "target": "combustion engine", "size": 0.2}, {"source": "combustion engine", "target": "least partially", "size": 0.2}, {"source": "least partially", "target": "combustion chamber", "size": 0.2}, {"source": "combustion chamber", "target": "exhaust gas", "size": 0.2}, {"source": "first period", "target": "second period", "size": 0.2}, {"source": "gas turbine", "target": "heat exchanger", "size": 0.2}, {"source": "control unit", "target": "combustion chamber", "size": 0.2}]}]' \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4ad7b7c..0000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -matplotlib -numpy -scipy -wordcloud -pandas -tqdm -nltk -scikit-learn diff --git a/scripts/visualization/graphs/fdgprep.py b/scripts/visualization/graphs/fdgprep.py index b0fa2cf..021d34d 100644 --- a/scripts/visualization/graphs/fdgprep.py +++ b/scripts/visualization/graphs/fdgprep.py @@ -94,11 +94,11 @@ def __create_graph_json(self): def save_graph(self, fname, varname): graph = self.__create_graph_json() - file_name = os.path.join('outputs/fdg', fname + '.json') - with open(file_name, 'w') as json_temp: - json_temp.write(varname + " = '[") - json.dump(graph, json_temp) - 
json_temp.write("]'") + file_name = os.path.join('outputs', 'fdg', fname + '.js') + with open(file_name, 'w') as js_temp: + js_temp.write(varname + " = '[") + json.dump(graph, js_temp) + js_temp.write("]'") def fdg_tfidf(self, tf_idf, tf_idf2, args): num_terms_to_evaluate = 20 diff --git a/scripts/visualization/wordclouds/multicloudplot.py b/scripts/visualization/wordclouds/multicloudplot.py index 5f299ca..77a79f8 100644 --- a/scripts/visualization/wordclouds/multicloudplot.py +++ b/scripts/visualization/wordclouds/multicloudplot.py @@ -1,6 +1,7 @@ import random import matplotlib matplotlib.use('TkAgg') + import matplotlib.pyplot as plt from wordcloud import WordCloud