diff --git a/README.md b/README.md
index 92105ac..51e6a46 100644
--- a/README.md
+++ b/README.md
@@ -27,11 +27,11 @@ The score here is derived from the term [tf-idf](https://en.wikipedia.org/wiki/T
### Word cloud
-Here is a [wordcloud](https://github.com/datasciencecampus/patent_app_detect/output/wordclouds/wordcloud_tech.png) using the Y02 classification on a 10,000 random sample of patents. The greater the tf-idf score, the larger the font size of the term.
+Here is a [wordcloud](https://raw.githubusercontent.com/datasciencecampus/patent_app_detect/master/outputs/wordclouds/wordcloud_tech.png) using the Y02 classification on a 10,000 random sample of patents. The greater the tf-idf score, the larger the font size of the term.
### Force directed graph
-This output provides an [interactive graph](https://github.com/datasciencecampus/patent_app_detect/outputs/fdg/index.html) that shows connections between terms that are generally found in the same patent documents. This example was run for the Y02 classification on a 10,000 random sample of patents.
+This output provides an interactive graph to be viewed in a web browser (you need to locally open the file ```outputs/fdg/index.html```). The graph shows connections between terms that are generally found in the same patent documents. The example graph in the ```outputs/fdg``` folder was created using the Y02 classification on a 10,000 random sample of patents.
## How to install
@@ -82,6 +82,27 @@ python detect.py -ps=USPTO-random-10000
Will run the tool for a pre-created random dataset of 10,000 patents.
+### Additional patent sources
+
+Patent datasets are stored in the sub-folder ```data```. We have supplied the following files:
+- ```USPTO-random-100.pkl.bz2```
+- ```USPTO-random-1000.pkl.bz2```
+- ```USPTO-random-10000.pkl.bz2```
+- ```USPTO-random-100000.pkl.bz2```
+- ```USPTO-random-500000.pkl.bz2```
+
+The command ```python detect.py -ps=USPTO-random-10000``` instructs the program to load a pickled data frame of patents
+from a file located in ```data/USPTO-random-10000.pkl.bz2```. Hence ```-ps=NAME``` looks for ```data/NAME.pkl.bz2```.
+
+We have hosted larger datasets on Google Drive, as the files are too large for GitHub version control. We have made available:
+- All USPTO patents from 2004 onwards (477 MB): [USPTO-all.pkl.bz2](https://drive.google.com/drive/folders/1d47pizWdKqtORS1zoBzsk3tLk6VZZA4N)
+
+To use additional files, follow the link and download the pickle file into the data folder. Access the new data
+with ```-ps=NameWithoutFileExtension```; for example, ```USPTO-all.pkl.bz2``` would be loaded with ```-ps=USPTO-all```.
+
+Note that large datasets require a large amount of system memory (such as 64 GB); otherwise processing will be very slow,
+as virtual memory (swap) is very likely to be used.
+
### Choosing CPC classification
This subsets the chosen patents dataset to a particular Cooperative Patent Classification (CPC) class, for example Y02. The Y02 classification is for "technologies or applications for mitigation or adaptation against climate change". In this case a larger patent dataset is generally required to allow for the reduction in patent numbers after subsetting. An example script is:
@@ -216,3 +237,9 @@ optional arguments:
the desired cpc classification
```
+
+## Acknowledgements
+
+### Patent data
+
+Patent data was obtained from the [United States Patent and Trademark Office (USPTO)](https://www.uspto.gov) through the [Bulk Data Storage System (BDSS)](https://bulkdata.uspto.gov). In particular we used the `Patent Grant Full Text Data/APS (JAN 1976 - PRESENT)` dataset, using the data from 2004 onwards in XML 4.* format.
diff --git a/detect.py b/detect.py
index 38682d8..5085896 100644
--- a/detect.py
+++ b/detect.py
@@ -109,7 +109,7 @@ def get_tfidf(args, filename, cpc):
def main():
- paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'json'), os.path.join('outputs', 'wordclouds')]
+ paths = [os.path.join('outputs', 'reports'), os.path.join('outputs', 'wordclouds')]
for path in paths:
os.makedirs(path, exist_ok=True)
diff --git a/outputs/fdg/empty.json b/outputs/fdg/empty.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/outputs/fdg/empty.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/outputs/fdg/f.js b/outputs/fdg/f.js
index a9a3927..d50d0a5 100644
--- a/outputs/fdg/f.js
+++ b/outputs/fdg/f.js
@@ -1,10 +1,7 @@
-var dataURL = "http://mysafeinfo.com/api/data?list=englishmonarchs&format=json";
-
+var dataURL = "https://raw.githubusercontent.com/datasciencecampus/patent_app_detect/master/outputs/fdg/empty.json";
var refresh = function(data){
-
-
var json_obj = JSON.parse(data);
var svg = d3.select("svg"),
width = +svg.attr("width"),
@@ -25,40 +22,20 @@ d3.select("div#chartId")
//class to make it responsive
.classed("svg-content-responsive", true);
-//var container = d3.select('body').append('div')
-// .attr('id','container')
-//;
-//
-//// svg#sky
-//var sky = container.append('svg')
-// //.attr('height', 100)
-// //.attr('width', 100)
-// .attr('id', 'sky')
-//;
-
var color = d3.scaleOrdinal(d3.schemeCategory20c);
-//var nodeRadius = 20;
var padding = 1, // separation between circles
radius=6;
-
-
var simulation = d3.forceSimulation()
.force("link", d3.forceLink().id(function(d) {
return d.text;
}).distance(300))
.force("charge", d3.forceManyBody().strength(-100))
.force("center", d3.forceCenter(width / 2, height / 2))
- //.force("gravity", 0.05)
- //.force("linkDistance", 50)
- //.force("size", [9000, 6000])
.force("collide", d3.forceCollide().radius(function(d) {
return 12*radius + padding; }).iterations(40))
-
-
-
d3.json(dataURL, function(error, graph) {
if (error) throw error;
@@ -72,7 +49,6 @@ d3.json(dataURL, function(error, graph) {
.data(graph.links)
.enter().append("line").attr("stroke-width", function(d) {
return (8*d.size);
- //Math.sqrt(1.5*d.size);
});
var node = svg.append("g")
@@ -119,7 +95,6 @@ d3.json(dataURL, function(error, graph) {
.text(function(d) {
return d.text
});
-
simulation
.nodes(graph.nodes)
diff --git a/outputs/fdg/index.html b/outputs/fdg/index.html
index 707e4b1..56efa59 100644
--- a/outputs/fdg/index.html
+++ b/outputs/fdg/index.html
@@ -4,5 +4,5 @@
-
+
diff --git a/outputs/fdg/key-terms.js b/outputs/fdg/key-terms.js
new file mode 100644
index 0000000..a2da386
--- /dev/null
+++ b/outputs/fdg/key-terms.js
@@ -0,0 +1 @@
+data = '[{"nodes": [{"text": "bottom surface", "freq": 0.6726052091113843}, {"text": "circuit board", "freq": 0.7063039273398004}, {"text": "communication device", "freq": 0.8441174450907452}, {"text": "communication system", "freq": 0.6841318043798694}, {"text": "composition comprising", "freq": 0.6989498037779591}, {"text": "computer program", "freq": 0.5944663796903795}, {"text": "computer system", "freq": 0.8236626514636429}, {"text": "control signal", "freq": 0.608818068396678}, {"text": "control system", "freq": 0.7092940998581272}, {"text": "control unit", "freq": 0.7006086451058384}, {"text": "crystal display", "freq": 0.735757252300935}, {"text": "data processing", "freq": 0.5625517519391339}, {"text": "device comprises", "freq": 0.6798825701962133}, {"text": "display device", "freq": 0.6699163987387577}, {"text": "electronic device", "freq": 1.0836145828560166}, {"text": "first layer", "freq": 0.6696081758173493}, {"text": "first signal", "freq": 0.6153414407663305}, {"text": "first surface", "freq": 0.4804165912304299}, {"text": "gate electrode", "freq": 0.6856552419742178}, {"text": "host computer", "freq": 0.7524889882464381}, {"text": "image data", "freq": 0.6588083301554957}, {"text": "inner surface", "freq": 0.5339913211938395}, {"text": "insulating layer", "freq": 0.5796244292088581}, {"text": "integrated circuit", "freq": 0.7369670722710928}, {"text": "internal combustion", "freq": 0.5505976390221616}, {"text": "internal combustion engine", "freq": 0.5505976390221616}, {"text": "layer formed", "freq": 0.593164590717632}, {"text": "least partially", "freq": 0.6625063974987224}, {"text": "light emitting", "freq": 0.8299031005427082}, {"text": "light source", "freq": 1.124916561762969}, {"text": "liquid crystal", "freq": 0.9408618507563702}, {"text": "liquid crystal display", "freq": 0.7180726385545779}, {"text": "longitudinal axis", "freq": 0.701642649124472}, {"text": "memory cell", "freq": 0.813200268633091}, {"text": "memory device", "freq": 
0.6680810754385427}, {"text": "nucleic acid", "freq": 0.7956662195388315}, {"text": "pharmaceutical composition", "freq": 0.7732320560420489}, {"text": "plant produced", "freq": 0.5083491104900089}, {"text": "power supply", "freq": 1.2}, {"text": "second layer", "freq": 0.6664497306675388}, {"text": "semiconductor device", "freq": 0.80906001352676}, {"text": "semiconductor layer", "freq": 0.7911562791233471}, {"text": "semiconductor substrate", "freq": 0.9431288684635308}, {"text": "spaced apart", "freq": 0.5049907954259427}, {"text": "storage unit", "freq": 0.5560998001482339}, {"text": "thin film", "freq": 0.6150075162220825}, {"text": "tissue culture", "freq": 0.4648929920260698}, {"text": "top surface", "freq": 0.7128266744449199}, {"text": "unit configured", "freq": 0.7727955980163723}, {"text": "user interface", "freq": 0.6980835935865308}], "links": [{"source": "semiconductor substrate", "target": "semiconductor device", "size": 0.4}, {"source": "gate electrode", "target": "semiconductor substrate", "size": 0.2}, {"source": "internal combustion engine", "target": "internal combustion", "size": 1.0}, {"source": "first surface", "target": "first signal", "size": 0.2}, {"source": "second layer", "target": "first layer", "size": 0.2}, {"source": "composition comprising", "target": "pharmaceutical composition", "size": 0.6000000000000001}, {"source": "light source", "target": "light emitting", "size": 0.2}, {"source": "integrated circuit", "target": "first layer", "size": 0.2}, {"source": "first layer", "target": "second layer", "size": 0.6000000000000001}, {"source": "host computer", "target": "computer system", "size": 0.2}, {"source": "semiconductor device", "target": "device comprises", "size": 0.2}, {"source": "device comprises", "target": "integrated circuit", "size": 0.2}, {"source": "insulating layer", "target": "gate electrode", "size": 0.2}, {"source": "thin film", "target": "display device", "size": 0.2}, {"source": "light source", "target": "host 
computer", "size": 0.2}, {"source": "spaced apart", "target": "liquid crystal", "size": 0.2}, {"source": "longitudinal axis", "target": "spaced apart", "size": 0.2}, {"source": "liquid crystal display", "target": "crystal display", "size": 1.2}, {"source": "crystal display", "target": "liquid crystal", "size": 1.2}, {"source": "liquid crystal", "target": "layer formed", "size": 0.2}, {"source": "liquid crystal", "target": "power supply", "size": 0.2}, {"source": "semiconductor layer", "target": "first layer", "size": 0.2}, {"source": "memory cell", "target": "memory device", "size": 0.2}, {"source": "semiconductor substrate", "target": "gate electrode", "size": 0.2}, {"source": "gate electrode", "target": "semiconductor device", "size": 0.2}, {"source": "tissue culture", "target": "plant produced", "size": 1.0}, {"source": "electronic device", "target": "device comprises", "size": 0.2}, {"source": "memory device", "target": "integrated circuit", "size": 0.2}, {"source": "liquid crystal", "target": "display device", "size": 0.2}, {"source": "bottom surface", "target": "top surface", "size": 0.8}, {"source": "top surface", "target": "spaced apart", "size": 0.2}, {"source": "first surface", "target": "least partially", "size": 0.2}, {"source": "thin film", "target": "liquid crystal", "size": 0.2}, {"source": "device comprises", "target": "semiconductor device", "size": 0.2}, {"source": "display device", "target": "light emitting", "size": 0.2}, {"source": "power supply", "target": "light emitting", "size": 0.2}, {"source": "top surface", "target": "inner surface", "size": 0.2}, {"source": "plant produced", "target": "tissue culture", "size": 0.2}, {"source": "storage unit", "target": "control unit", "size": 0.2}, {"source": "control unit", "target": "communication device", "size": 0.2}, {"source": "semiconductor layer", "target": "layer formed", "size": 0.2}, {"source": "layer formed", "target": "memory device", "size": 0.2}, {"source": "gate electrode", "target": 
"insulating layer", "size": 0.2}, {"source": "insulating layer", "target": "semiconductor layer", "size": 0.2}, {"source": "semiconductor layer", "target": "semiconductor substrate", "size": 0.2}, {"source": "semiconductor substrate", "target": "semiconductor layer", "size": 0.2}, {"source": "liquid crystal", "target": "light source", "size": 0.2}, {"source": "layer formed", "target": "least partially", "size": 0.2}, {"source": "insulating layer", "target": "thin film", "size": 0.2}, {"source": "thin film", "target": "circuit board", "size": 0.2}, {"source": "light source", "target": "communication device", "size": 0.2}, {"source": "communication device", "target": "light emitting", "size": 0.2}, {"source": "circuit board", "target": "layer formed", "size": 0.2}, {"source": "semiconductor layer", "target": "thin film", "size": 0.2}, {"source": "thin film", "target": "first layer", "size": 0.2}, {"source": "memory device", "target": "display device", "size": 0.2}, {"source": "computer system", "target": "user interface", "size": 0.2}, {"source": "computer system", "target": "computer program", "size": 0.2}, {"source": "power supply", "target": "electronic device", "size": 0.2}, {"source": "unit configured", "target": "image data", "size": 0.2}, {"source": "image data", "target": "control unit", "size": 0.2}, {"source": "host computer", "target": "display device", "size": 0.2}, {"source": "top surface", "target": "light source", "size": 0.2}, {"source": "circuit board", "target": "control unit", "size": 0.2}, {"source": "device comprises", "target": "light source", "size": 0.2}, {"source": "inner surface", "target": "spaced apart", "size": 0.2}, {"source": "liquid crystal", "target": "gate electrode", "size": 0.2}, {"source": "gate electrode", "target": "spaced apart", "size": 0.2}, {"source": "top surface", "target": "first surface", "size": 0.2}]}]'
\ No newline at end of file
diff --git a/outputs/fdg/key-terms.json b/outputs/fdg/key-terms.json
deleted file mode 100644
index 5e9a28c..0000000
--- a/outputs/fdg/key-terms.json
+++ /dev/null
@@ -1 +0,0 @@
-data = '[{"nodes": [{"text": "absorber layer", "freq": 0.5577280116665608}, {"text": "air supply", "freq": 0.4218314588766082}, {"text": "catalyst system", "freq": 0.46947087379298164}, {"text": "charge air cooler", "freq": 0.3714746903313664}, {"text": "combustion chamber", "freq": 0.8404564666280396}, {"text": "combustion engine", "freq": 0.8833737327573354}, {"text": "combustion gas", "freq": 0.49637940629126637}, {"text": "control device", "freq": 0.4726868628152021}, {"text": "control signal", "freq": 0.5405014323800565}, {"text": "control unit", "freq": 0.6870767559356139}, {"text": "electrical power", "freq": 0.6194948900716555}, {"text": "electrode assembly", "freq": 0.3903145898802335}, {"text": "electronic control", "freq": 0.45785753445920574}, {"text": "electronic control unit", "freq": 0.38637998618077196}, {"text": "energy storage", "freq": 0.7984048463402005}, {"text": "energy storage unit", "freq": 0.5137820511235243}, {"text": "exhaust gas", "freq": 0.8911067864630173}, {"text": "first period", "freq": 0.35412161624882355}, {"text": "first voltage", "freq": 0.4677785748388333}, {"text": "fuel cell", "freq": 1.2}, {"text": "gas turbine", "freq": 0.491809721865946}, {"text": "generation system", "freq": 0.4437535747196988}, {"text": "heat exchanger", "freq": 0.9867655893110394}, {"text": "internal combustion", "freq": 0.7054883246355204}, {"text": "internal combustion engine", "freq": 0.6745635676615691}, {"text": "least partially", "freq": 0.49063232747867486}, {"text": "metal oxide", "freq": 0.38598231263553406}, {"text": "photovoltaic device", "freq": 0.5665137779743485}, {"text": "positive electrode", "freq": 0.7019768030670732}, {"text": "power consumption", "freq": 0.4257771050416817}, {"text": "power generation", "freq": 0.4633715885843366}, {"text": "power management", "freq": 0.5138158849097184}, {"text": "power save", "freq": 0.47581712135222387}, {"text": "power source", "freq": 0.49213776710203305}, {"text": "power split", "freq": 
0.34059739682392864}, {"text": "power split device", "freq": 0.34059739682392864}, {"text": "power supply", "freq": 0.672389446284928}, {"text": "precursor comprising", "freq": 0.36192513420519024}, {"text": "predetermined period", "freq": 0.4141884797809875}, {"text": "second led", "freq": 0.47539748252319913}, {"text": "second period", "freq": 0.30795696918214815}, {"text": "secondary battery", "freq": 0.6230986279388568}, {"text": "solar cell", "freq": 0.5337441419844668}, {"text": "split device", "freq": 0.34059739682392864}, {"text": "storage unit", "freq": 0.583622947061899}, {"text": "thin film", "freq": 0.47747119084011735}, {"text": "transmission system", "freq": 0.315547372920685}, {"text": "unit configured", "freq": 0.5542686017729994}, {"text": "voltage supplied", "freq": 0.3534296750361906}, {"text": "window layer", "freq": 0.5130965157555655}], "links": [{"source": "power generation", "target": "generation system", "size": 0.2}, {"source": "exhaust gas", "target": "air supply", "size": 0.2}, {"source": "catalyst system", "target": "metal oxide", "size": 0.2}, {"source": "electronic control unit", "target": "electronic control", "size": 0.7}, {"source": "electronic control", "target": "control unit", "size": 0.7}, {"source": "generation system", "target": "solar cell", "size": 0.2}, {"source": "split device", "target": "power split device", "size": 0.7}, {"source": "power split device", "target": "power split", "size": 0.7}, {"source": "unit configured", "target": "predetermined period", "size": 0.2}, {"source": "predetermined period", "target": "voltage supplied", "size": 0.2}, {"source": "voltage supplied", "target": "second period", "size": 0.2}, {"source": "second period", "target": "first period", "size": 0.2}, {"source": "first period", "target": "first voltage", "size": 0.2}, {"source": "first voltage", "target": "power source", "size": 0.2}, {"source": "absorber layer", "target": "window layer", "size": 0.2}, {"source": "window layer", 
"target": "solar cell", "size": 0.2}, {"source": "thin film", "target": "precursor comprising", "size": 0.2}, {"source": "precursor comprising", "target": "photovoltaic device", "size": 0.2}, {"source": "secondary battery", "target": "positive electrode", "size": 0.7}, {"source": "electrical power", "target": "transmission system", "size": 0.2}, {"source": "energy storage unit", "target": "storage unit", "size": 0.7}, {"source": "storage unit", "target": "energy storage", "size": 0.2}, {"source": "internal combustion engine", "target": "internal combustion", "size": 1.2}, {"source": "internal combustion", "target": "combustion engine", "size": 1.2}, {"source": "power save", "target": "power consumption", "size": 0.2}, {"source": "power supply", "target": "internal combustion engine", "size": 0.2}, {"source": "fuel cell", "target": "charge air cooler", "size": 0.2}, {"source": "first voltage", "target": "power management", "size": 0.2}, {"source": "control signal", "target": "first voltage", "size": 0.2}, {"source": "heat exchanger", "target": "combustion gas", "size": 0.2}, {"source": "secondary battery", "target": "control device", "size": 0.2}, {"source": "control device", "target": "positive electrode", "size": 0.2}, {"source": "window layer", "target": "absorber layer", "size": 0.2}, {"source": "absorber layer", "target": "photovoltaic device", "size": 0.2}, {"source": "combustion gas", "target": "fuel cell", "size": 0.2}, {"source": "combustion engine", "target": "control unit", "size": 0.2}, {"source": "gas turbine", "target": "least partially", "size": 0.2}, {"source": "storage unit", "target": "electrical power", "size": 0.2}, {"source": "electrical power", "target": "energy storage", "size": 0.2}, {"source": "combustion gas", "target": "combustion engine", "size": 0.2}, {"source": "combustion engine", "target": "least partially", "size": 0.2}, {"source": "least partially", "target": "combustion chamber", "size": 0.2}, {"source": "combustion chamber", 
"target": "exhaust gas", "size": 0.2}, {"source": "first period", "target": "second period", "size": 0.2}, {"source": "gas turbine", "target": "heat exchanger", "size": 0.2}, {"source": "control unit", "target": "combustion chamber", "size": 0.2}]}]'
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 4ad7b7c..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-matplotlib
-numpy
-scipy
-wordcloud
-pandas
-tqdm
-nltk
-scikit-learn
diff --git a/scripts/visualization/graphs/fdgprep.py b/scripts/visualization/graphs/fdgprep.py
index b0fa2cf..021d34d 100644
--- a/scripts/visualization/graphs/fdgprep.py
+++ b/scripts/visualization/graphs/fdgprep.py
@@ -94,11 +94,11 @@ def __create_graph_json(self):
def save_graph(self, fname, varname):
graph = self.__create_graph_json()
- file_name = os.path.join('outputs/fdg', fname + '.json')
- with open(file_name, 'w') as json_temp:
- json_temp.write(varname + " = '[")
- json.dump(graph, json_temp)
- json_temp.write("]'")
+ file_name = os.path.join('outputs', 'fdg', fname + '.js')
+ with open(file_name, 'w') as js_temp:
+ js_temp.write(varname + " = '[")
+ json.dump(graph, js_temp)
+ js_temp.write("]'")
def fdg_tfidf(self, tf_idf, tf_idf2, args):
num_terms_to_evaluate = 20
diff --git a/scripts/visualization/wordclouds/multicloudplot.py b/scripts/visualization/wordclouds/multicloudplot.py
index 5f299ca..77a79f8 100644
--- a/scripts/visualization/wordclouds/multicloudplot.py
+++ b/scripts/visualization/wordclouds/multicloudplot.py
@@ -1,6 +1,7 @@
import random
import matplotlib
matplotlib.use('TkAgg')
+
import matplotlib.pyplot as plt
from wordcloud import WordCloud