forked from fabian-beck/survis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_data.py
147 lines (124 loc) · 4.72 KB
/
update_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import json
import codecs
import time
# --- Inputs: everything is resolved relative to the directory of this script.
BASE_DIR = os.path.dirname(__file__)
BIB_FILE = os.path.join(BASE_DIR, "bib/references.bib")
DATA_DIR = os.path.join(BASE_DIR, "src/data/")
PAPERS_DIR = os.path.join(DATA_DIR, "papers_pdf/")
PAPERS_IMG_DIR = os.path.join(DATA_DIR, "papers_img/")

# --- Outputs: JavaScript data modules written by this script.
GENERATED_DIR = os.path.join(DATA_DIR, "generated/")
BIB_JS_FILE = os.path.join(GENERATED_DIR, "bib.js")
AVAILABLE_PDF_FILE = os.path.join(GENERATED_DIR, "available_pdf.js")
AVAILABLE_IMG_FILE = os.path.join(GENERATED_DIR, "available_img.js")
# Switch for thumbnail generation: when True, a PNG thumbnail is created for
# each listed PDF. Enabling it requires the third-party pdf2image package.
# See: https://github.com/Belval/pdf2image
CREATE_THUMBNAILS = False

if CREATE_THUMBNAILS:
    # Only imported when needed so the script runs without pdf2image installed.
    import tempfile
    from pdf2image import convert_from_path
def parseBibtex(bibFile):
    """Parse a BibTeX file into a dict of entries keyed by citation id.

    This is a lightweight, line-oriented reader, not a full BibTeX parser:

    * a line starting with ``@`` opens a new entry; the text before the first
      ``{`` becomes the entry's ``"type"`` and the text after it the id,
    * a line containing ``=`` is read as ``field = value`` (field names are
      lower-cased, surrounding braces and a trailing ``},`` are removed),
    * any other line is treated as a continuation of the previous field and
      appended to it, separated by a single space,
    * lines starting with ``@Comment`` are skipped.

    Args:
        bibFile: Path of the BibTeX file (read as UTF-8, a BOM is tolerated).

    Returns:
        ``{entry_id: {"type": entry_type, field_name: value, ...}}``.
    """
    parsedData = {}
    currentId = ""
    lastField = ""
    with codecs.open(bibFile, "r", "utf-8-sig") as fIn:
        for line in fIn:
            line = line.strip("\n").strip("\r")
            if line.startswith("@Comment"):
                continue
            if line.startswith("@"):
                headerParts = line.split("{")
                if len(headerParts) < 2:
                    # Header without "{" (e.g. a parenthesised @string):
                    # there is no id to store it under, so ignore its body.
                    currentId = ""
                    lastField = ""
                    continue
                currentId = headerParts[1].rstrip(",\n")
                currentType = headerParts[0].strip("@ ")
                parsedData[currentId] = {"type": currentType}
                # Forget the previous entry's last field. Otherwise, if that
                # field was "type", this header line would be appended to the
                # new entry's type as if it were a continuation line.
                lastField = ""
            if currentId == "":
                continue
            entry = parsedData[currentId]
            if "=" in line:
                # Split on the first "=" only, so values that contain "="
                # themselves (URLs with query strings, formulas) stay intact.
                rawField, _, rawValue = line.partition("=")
                field = rawField.strip().lower()
                value = rawValue.strip("} \n")
                if value.endswith("},"):
                    value = value[:-2]
                if value.startswith("{"):
                    value = value[1:]
                if field in entry:
                    entry[field] = entry[field] + " " + value
                else:
                    entry[field] = value
                lastField = field
            elif lastField in entry:
                # Continuation of a multi-line value.
                value = line.strip().strip("} \n").replace("},", "").strip()
                if value:
                    entry[lastField] = entry[lastField] + " " + value
    return parsedData
def writeJSON(parsedData):
    """Dump the parsed entries to BIB_JS_FILE as a JavaScript data module.

    The entries are serialized as pretty-printed JSON with sorted keys and
    wrapped in a ``define({ entries : ... });`` call. The file is written as
    UTF-8 with a BOM ("utf-8-sig").

    Args:
        parsedData: Dictionary of entries as returned by parseBibtex.
    """
    with codecs.open(BIB_JS_FILE, "w", "utf-8-sig") as jsFile:
        entriesJson = json.dumps(
            parsedData,
            sort_keys=True,
            indent=4,
            separators=(',', ': '),
        )
        jsFile.write("".join(("define({ entries : ", entriesJson, "});")))
def listAvailablePdf():
    """Write the ids of all PDFs found in PAPERS_DIR to AVAILABLE_PDF_FILE.

    An id is the file name with ".pdf" removed. The result is written as
    ``define({availablePdf: ["id1","id2"]});``. When CREATE_THUMBNAILS is
    enabled, create_thumbnail is called for every listed PDF.
    """
    pdfIds = []
    # Sorted so the generated file does not depend on directory listing order.
    for fileName in sorted(os.listdir(PAPERS_DIR)):
        if not fileName.endswith(".pdf"):
            continue
        pdfIds.append(fileName.replace(".pdf", ""))
        if CREATE_THUMBNAILS:
            create_thumbnail(fileName)
    # json.dumps escapes quotes and backslashes, so unusual file names cannot
    # break the generated JavaScript; ordinary names are emitted unchanged.
    quotedIds = ",".join(json.dumps(pdfId, ensure_ascii=False) for pdfId in pdfIds)
    # Open the output only once the content is complete, and always close it.
    with open(AVAILABLE_PDF_FILE, "w") as fOut:
        fOut.write("define({availablePdf: [" + quotedIds + "]});")
def listAvailableImg():
    """Write the ids of all PNGs found in PAPERS_IMG_DIR to AVAILABLE_IMG_FILE.

    An id is the file name with ".png" removed. The result is written as
    ``define({ availableImg: ["id1","id2"]});``.
    """
    # Sorted so the generated file does not depend on directory listing order.
    imgIds = [
        fileName.replace(".png", "")
        for fileName in sorted(os.listdir(PAPERS_IMG_DIR))
        if fileName.endswith(".png")
    ]
    # json.dumps escapes quotes and backslashes, so unusual file names cannot
    # break the generated JavaScript; ordinary names are emitted unchanged.
    quotedIds = ",".join(json.dumps(imgId, ensure_ascii=False) for imgId in imgIds)
    # Open the output only once the content is complete, and always close it.
    with open(AVAILABLE_IMG_FILE, "w") as fOut:
        fOut.write("define({ availableImg: [" + quotedIds + "]});")
def create_thumbnail(file):
    """Render the first page of a PDF in PAPERS_DIR as a PNG in PAPERS_IMG_DIR.

    The thumbnail gets the PDF's file name with ".pdf" replaced by ".png".
    Nothing is rendered when that thumbnail file already exists.

    Args:
        file: File name (not a path) of a PDF inside PAPERS_DIR.
    """
    source_pdf = os.path.join(PAPERS_DIR, file)
    target_png = os.path.join(PAPERS_IMG_DIR, file.replace(".pdf", ".png"))

    if os.path.isfile(target_png):
        print(f"Skipping thumbnail generation for existing file {target_png}")
        return

    print(f"Generate thumbnail for {file} and save it to {target_png}")
    # pdf2image writes its intermediate page files into a scratch directory
    # that is removed again once the thumbnail has been saved.
    with tempfile.TemporaryDirectory() as scratch_dir:
        first_page = convert_from_path(
            source_pdf, 72, output_folder=scratch_dir, last_page=1, fmt="png"
        )[0]
        first_page.save(target_png)
    print("Done.")
def update():
    """Regenerate all derived data files, reporting each step on stdout.

    Converts BIB_FILE into the JavaScript bibliography module, then rewrites
    the lists of available paper PDFs and paper images.
    """
    print("convert bib file")
    entries = parseBibtex(BIB_FILE)
    writeJSON(entries)

    print("list available paper PDF files")
    listAvailablePdf()

    print("list available paper images")
    listAvailableImg()

    print("done")
def generate_folders():
    """Create the generated-data and paper folders if they do not exist yet.

    Missing parent directories are created as well; directories that already
    exist are left untouched.

    Raises:
        FileExistsError: If one of the paths exists but is not a directory.
    """
    for folder in (GENERATED_DIR, PAPERS_DIR, PAPERS_IMG_DIR):
        # exist_ok=True tolerates an existing directory but, unlike a blanket
        # "except FileExistsError: pass", still reports a regular file that is
        # in the way.
        os.makedirs(folder, exist_ok=True)
if __name__ == '__main__':
    # Watch the bib file: regenerate the data files on start-up and again
    # whenever the file's modification time changes, polling once per second.
    generate_folders()
    lastSeenBibTime = 0
    while True:
        bibTime = os.path.getmtime(BIB_FILE)
        if bibTime == lastSeenBibTime:
            print(f"waiting for changes in bib file: {BIB_FILE}")
        else:
            print("detected change in bib file")
            update()
            lastSeenBibTime = bibTime
        time.sleep(1)