-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
515 lines (423 loc) · 19.1 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
# app.py
from flask import Flask, render_template, request, redirect, url_for, session, jsonify, Response, flash
from flask_session import Session
from flask_talisman import Talisman
import os
import threading
import uuid
import json
from backend.processing import (extract_seed, get_keywords, scopus_sampling_process)
import logging
from logging.handlers import TimedRotatingFileHandler
# Make sure the log directory exists before any file handler is attached.
# exist_ok=True makes this a single idempotent call, so there is no window
# between "does it exist?" and "create it" in which a second process (for
# example the debug reloader) could create the directory and make us fail.
os.makedirs('logs', exist_ok=True)
# The Flask application object; the configuration, extensions (Talisman,
# Session) and route handlers defined below are all registered on it.
app = Flask(__name__)
def setup_logging():
    """
    Attach a daily-rotating file handler to the Flask application's logger.

    Records at INFO level and above are written to 'logs/app.log'. The file
    rolls over at midnight and up to seven rotated backups are kept. A
    confirmation message is logged once the handler is in place.
    """
    handler = TimedRotatingFileHandler(
        os.path.join('logs', 'app.log'),
        when='midnight',
        interval=1,
        backupCount=7,
    )
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
    ))

    logger = app.logger
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)

    # Confirms in the log file itself that the handler is wired up.
    logger.info('Logging setup complete.')
# Content Security Policy: maps each CSP directive to the list of sources
# the browser may load that kind of resource from.
csp = {
    # Fallback for resource types without their own directive:
    # same origin, the Bootstrap CDN and jsDelivr.
    'default-src': ["'self'", 'https://stackpath.bootstrapcdn.com', 'https://cdn.jsdelivr.net'],
    # Scripts: same origin, inline scripts (use with caution) and the jQuery CDN.
    'script-src': ["'self'", "'unsafe-inline'", 'https://code.jquery.com'],
    # Stylesheets: same origin, inline styles and Bootstrap CSS.
    'style-src': ["'self'", "'unsafe-inline'", 'https://stackpath.bootstrapcdn.com'],
    # Images: same origin and data: URIs.
    'img-src': ["'self'", 'data:'],
    # Fonts: same origin and Google Fonts.
    'font-src': ["'self'", 'https://fonts.gstatic.com'],
}
# Apply Content Security Policy using Talisman
Talisman(app, content_security_policy=csp)

# Session identifiers are signed with this key (SESSION_USE_SIGNER below), so
# a publicly known fallback value would let anyone forge a valid signature.
# When SECRET_KEY is not configured, generate a random key for this process
# instead of using a hard-coded default.
_secret_key = os.getenv('SECRET_KEY')
_secret_key_is_ephemeral = not _secret_key
if _secret_key_is_ephemeral:
    _secret_key = os.urandom(32).hex()
app.secret_key = _secret_key

# Reject request bodies larger than 32 MB.
app.config['MAX_CONTENT_LENGTH'] = 32 * 1024 * 1024

# Server-side sessions stored as files under the Flask instance folder.
app.config['SESSION_TYPE'] = 'filesystem'
app.config['SESSION_FILE_DIR'] = os.path.join(app.instance_path, 'sessions')
app.config['SESSION_PERMANENT'] = False
app.config['SESSION_USE_SIGNER'] = True

# Cookie hardening: not readable from JavaScript, only sent over HTTPS,
# and withheld from most cross-site requests.
app.config['SESSION_COOKIE_HTTPONLY'] = True
app.config['SESSION_COOKIE_SECURE'] = True
app.config['SESSION_COOKIE_SAMESITE'] = 'Lax'
Session(app)

# Call the logging setup function
setup_logging()

if _secret_key_is_ephemeral:
    # Emitted after setup_logging() so the warning also reaches the log file.
    app.logger.warning(
        "SECRET_KEY is not set; using a random key generated at startup. "
        "Existing sessions will not survive a restart. Set SECRET_KEY in the "
        "environment for a stable deployment."
    )

# Global dictionaries to store progress and results, keyed by sampling ID.
# They are written by the background sampling threads and read by the
# progress and results routes.
progress_info = {}
ranked_results = {}
# File extensions (lower-case, without the dot) accepted for the API key upload
ALLOWED_EXTENSIONS = {'json'}


def allowed_file(filename):
    """
    Tell whether a filename ends in one of the allowed extensions.

    The extension is whatever follows the last dot and is compared
    case-insensitively against ALLOWED_EXTENSIONS.

    Parameters:
    - filename (str): Name of the uploaded file.

    Returns:
    - bool: True if the name contains a dot and its extension is allowed.
    """
    if '.' not in filename:
        return False
    extension = filename.rpartition('.')[2]
    return extension.lower() in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """
    Serve the landing page.

    The page is where the user uploads the seed corpus and the Scopus API
    key. Any 'scopus_api_key' left in the session is dropped first, so a
    key from an earlier run is not carried over.

    Returns:
    - Response: The rendered 'index.html' template.
    """
    # Forget a previously uploaded key; a missing key is not an error.
    session.pop('scopus_api_key', None)

    return render_template('index.html')
def load_scopus_api_key(request):
    """
    Load and validate the Scopus API key file from the request.

    Parameters:
    - request (Request): The Flask request object containing the uploaded files.

    Returns:
    - None: If the API key was loaded, validated and stored in the session.
    - Response: A redirect to the index page (with a flashed error message)
      if any validation step fails.

    Validation steps:
    - The 'scopusApiKey' file must be present and have an allowed extension.
    - The file must parse as JSON.
    - The parsed document must be a JSON object containing both 'apikey'
      and 'insttoken'.

    On success the parsed object is stored under session['scopus_api_key'].
    """
    api_key_file = request.files.get('scopusApiKey')

    # Check if file is provided and if it's a valid JSON file
    if not api_key_file or not allowed_file(api_key_file.filename):
        flash("No API Key file uploaded or invalid file type. Please upload a valid JSON file.", "error")
        return redirect(url_for('index'))

    try:
        api_key_json = json.load(api_key_file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Malformed JSON, or bytes that cannot be decoded as text at all.
        flash("Invalid JSON file. Please upload a correctly formatted JSON file.", "error")
        return redirect(url_for('index'))
    except Exception:
        # Boundary handler: keep the details in the server log rather than
        # showing internal exception text to the user.
        app.logger.exception("Unexpected error while reading the Scopus API key file.")
        flash("An unexpected error occurred while reading the API key file.", "error")
        return redirect(url_for('index'))

    # The top-level value must be an object. Without this check a JSON string
    # or list that merely contains the words 'apikey' and 'insttoken' would
    # pass the membership tests below.
    has_required_fields = (
        isinstance(api_key_json, dict)
        and 'apikey' in api_key_json
        and 'insttoken' in api_key_json
    )
    if not has_required_fields:
        flash("Invalid API Key structure: Missing 'apikey' or 'insttoken'.", "error")
        return redirect(url_for('index'))

    # Store the valid API key in the session
    session['scopus_api_key'] = api_key_json
    return None  # No error, continue execution
@app.route('/extract_keywords', methods=['POST'])
def extract_keywords():
    """
    Process the uploaded seed corpus and Scopus API key to extract initial keywords.

    This route handles the form submission from the index page. It performs the following steps:
    - Validates that both 'seedCorpus' and 'scopusApiKey' files are provided,
      ignoring empty upload parts that carry no filename.
    - Calls 'load_scopus_api_key' to validate and store the Scopus API key.
      This runs before any corpus processing so an invalid key is rejected
      without first paying for the seed extraction.
    - Retrieves form data for 'threshold', 'num_keywords', and 'iterations'
      (defaults 100, 20 and 10; non-integer values fall back to the default).
    - Calls 'extract_seed' on the uploaded files, then 'get_keywords' on its result.
    - Stores 'threshold' and 'iterations' in the session for the sampling step.
    - Logs extracted keywords and parameters.

    Returns:
    - Response: Renders 'refine_keywords.html' with extracted keywords.
    - Redirect: Redirects to 'index' if validation fails.
    """
    missing_upload_message = "Please upload both the seed corpus and Scopus API key."

    # Check if the user has uploaded both seedCorpus and scopusApiKey
    if 'seedCorpus' not in request.files or 'scopusApiKey' not in request.files:
        flash(missing_upload_message, "error")
        return redirect(url_for('index'))

    # A form submitted without a chosen file still contains the field, as an
    # empty part with no filename; drop those so only real uploads remain.
    files = [upload for upload in request.files.getlist('seedCorpus') if upload.filename]
    if not files:
        flash(missing_upload_message, "error")
        return redirect(url_for('index'))

    # Validate the API key first: it is cheap, and a failure here makes the
    # seed extraction below pointless.
    api_key_error = load_scopus_api_key(request)
    if api_key_error is not None:
        return api_key_error  # If an error occurred, stop and redirect

    threshold = request.form.get('threshold', default=100, type=int)
    num_keywords = request.form.get('num_keywords', default=20, type=int)
    iterations = request.form.get('iterations', default=10, type=int)

    # Process the uploaded files and extract seed data
    seed_data = extract_seed(files)

    # Extract keywords using backend function
    keywords = get_keywords(seed_data, num_keywords)

    # Store parameters in session for later use
    session['threshold'] = threshold
    session['iterations'] = iterations

    app.logger.info(f"Extracted keywords: {[kw['word'] for kw in keywords]}")
    app.logger.info(f"Threshold: {threshold}, Number of Keywords: {num_keywords}")
    app.logger.info(f"Iterations: {iterations}")

    return render_template('refine_keywords.html', keywords=keywords)
def _parse_weight(raw_value):
    """Convert a submitted weight to a finite float, falling back to 0.0."""
    import math  # local import, matching the function-scope imports used elsewhere in this file

    try:
        weight = float(raw_value)
    except (TypeError, ValueError):
        # Missing, empty or non-numeric input.
        return 0.0
    # float() also accepts 'nan' and 'inf'; neither is a usable weight.
    return weight if math.isfinite(weight) else 0.0


@app.route('/process_refined_keywords', methods=['POST'])
def process_refined_keywords():
    """
    Process the refined keywords and their weights submitted by the user.

    This route handles the form submission from the 'refine_keywords' page. It performs the following steps:
    - Retrieves selected keywords and their weights, read from the form
      fields named 'weights[<keyword>]'.
    - Handles new keywords and weights added by the user
      ('new_keywords[]' / 'new_weights[]'); blank keywords are ignored and
      the keyword text is stripped of surrounding whitespace.
    - Constructs a 'weight_dict' mapping keywords to their weights. Missing,
      non-numeric or non-finite weights become 0.0.
    - Stores 'weight_dict' in the session for use in the sampling process.
    - Logs the processed keywords and weights.

    Returns:
    - Response: Renders 'auto_submit_start_sampling.html' to start sampling.
    """
    # Weights of the keywords the user kept selected
    weight_dict = {
        keyword: _parse_weight(request.form.get(f'weights[{keyword}]', default='0'))
        for keyword in request.form.getlist('selected_keywords')
    }

    # Handle new keywords added by the user; these are applied after the
    # selected ones, so a new keyword overrides a selected one of the same name.
    new_keywords = request.form.getlist('new_keywords[]')
    new_weights = request.form.getlist('new_weights[]')
    for keyword, weight in zip(new_keywords, new_weights):
        cleaned = keyword.strip()
        if cleaned:  # Ignore empty strings
            weight_dict[cleaned] = _parse_weight(weight)

    # Store the weight dictionary in the session
    session['weight_dict'] = weight_dict
    app.logger.info(f"Processed refined keywords: {weight_dict}")

    # Render the auto-submit page to start the sampling process
    return render_template('auto_submit_start_sampling.html')
def update_progress(sampling_id, outer_iter, outer_iterations, query, match_count):
    """
    Record the latest step of a sampling run in the global 'progress_info'.

    Parameters:
    - sampling_id (str): Unique identifier for the sampling process.
    - outer_iter (int): Current outer iteration number.
    - outer_iterations (int): Total number of outer iterations.
    - query (str): The current search query being processed.
    - match_count (int): Number of matches returned by the current query.

    If no entry exists yet for 'sampling_id', one is created with status
    'running' and an empty history. The entry's current iteration, query,
    match count and iteration total are then overwritten, and a record of
    this step is appended to its 'history' list. The entry is what the
    '/sampling_progress/<sampling_id>' route returns to the client.
    """
    entry = progress_info.setdefault(sampling_id, {
        'current_outer_iteration': 0,
        'outer_iterations': 0,
        'current_query': '',
        'last_match_count': 0,
        'status': 'running',
        'history': []
    })

    # Latest snapshot of the run
    entry.update(
        current_outer_iteration=outer_iter,
        current_query=query,
        last_match_count=match_count,
        outer_iterations=outer_iterations,
    )

    # Running log of every step reported so far
    step = {
        'outer_iteration': outer_iter,
        'query': query,
        'match_count': match_count
    }
    entry['history'].append(step)
@app.route('/start_sampling', methods=['POST'])
def start_sampling():
    """
    Initiate the sampling process using the refined keywords and parameters.

    This route performs the following steps:
    - Retrieves 'weight_dict', 'threshold', and 'iterations' from the session
      (defaults: empty dict, 100 and 10).
    - Validates again the presence of the Scopus API key.
    - Generates a unique 'sampling_id', initializes progress tracking and
      stores the ID in the session.
    - Defines and starts a background daemon thread to run the sampling process.
    - Renders 'processing.html' to display sampling progress to the user.

    The background thread sets the run's status in 'progress_info' to
    'completed' when sampling finishes, or to 'failed' if the sampling
    function raises, so the run never stays at 'running' after it has stopped.

    Returns:
    - Response: Renders 'processing.html' with the 'sampling_id'.
    - Response: A 400 error message if the Scopus API key is missing.
    - Redirect: Redirects to 'index' if there are no keywords to sample with.
    """
    # Retrieve refined keywords and parameters from the session
    weight_dict = session.get('weight_dict', {})
    threshold = session.get('threshold', 100)  # Default to 100 if not set
    outer_iterations = session.get('iterations', 10)  # Default to 10 if not set

    # Retrieve Scopus API Key from the session
    scopus_api_key = session.get('scopus_api_key', {})
    if not scopus_api_key:
        app.logger.warning("No Scopus API Key found in session.")
        return "Scopus API Key not found. Please upload your API key.", 400

    if not weight_dict:
        # Logger methods do not accept print()'s 'flush' keyword; passing it
        # raised TypeError and turned this redirect into a server error.
        app.logger.warning("No keywords available for sampling. Redirecting to index.")
        return redirect(url_for('index'))

    # Generate a unique sampling ID
    sampling_id = str(uuid.uuid4())

    # Initialize progress information
    progress_info[sampling_id] = {
        'current_outer_iteration': 0,
        'outer_iterations': outer_iterations,
        'queries': [],
        'matches_per_query': [],
        'status': 'running',
        'history': []
    }

    # Store sampling_id in the session
    session['sampling_id'] = sampling_id

    # Define the sampling thread function. It only uses values captured
    # above, because the request's session is not available in the thread.
    def run_sampling():
        app.logger.info(f"Starting sampling thread for Sampling ID: {sampling_id}")

        # Define the progress_callback
        def progress_callback(outer_iter, query, match_count):
            update_progress(sampling_id, outer_iter, outer_iterations, query, match_count)

        try:
            # Call scopus_sampling_process with the API key
            ranked = scopus_sampling_process(
                weight_dict=weight_dict,
                threshold=threshold,
                outer_iterations=outer_iterations,
                progress_callback=progress_callback,
                scopus_api_key=scopus_api_key
            )
        except Exception:
            # Thread boundary: an uncaught error here would otherwise leave
            # the status at 'running' with nothing to tell the client.
            app.logger.exception(f"Sampling thread for Sampling ID: {sampling_id} failed.")
            progress_info[sampling_id]['status'] = 'failed'
            return

        ranked_results[sampling_id] = ranked
        progress_info[sampling_id]['status'] = 'completed'
        app.logger.info(f"Sampling thread for Sampling ID: {sampling_id} completed.")

    # Start the sampling in a separate thread
    thread = threading.Thread(target=run_sampling, daemon=True)
    thread.start()

    return render_template('processing.html', sampling_id=sampling_id)
@app.route('/sampling_progress/<sampling_id>')
def sampling_progress(sampling_id):
    """
    Report the current progress of a sampling run as JSON.

    Parameters:
    - sampling_id (str): Unique identifier for the sampling process.

    Returns:
    - Response: The 'progress_info' entry for 'sampling_id' serialized as
      JSON, or an empty JSON object if the ID is unknown.

    An entry holds, among others:
    - 'current_outer_iteration': The current outer iteration number.
    - 'outer_iterations': The total number of outer iterations.
    - 'current_query': The most recent query executed.
    - 'last_match_count': The number of matches from the last query.
    - 'status': The current status of the sampling process.
    - 'history': A list of dictionaries recording queries and match counts.
    """
    return jsonify(progress_info.get(sampling_id, {}))
@app.route('/results')
def results():
    """
    Show the ranked papers produced by the sampling run of this session.

    Steps:
    - Reads 'sampling_id' from the session.
    - If there is no ID, or no entry for it in 'ranked_results', logs a
      warning and redirects to the index page.
    - Fetches the ranked papers; an empty list is logged as a warning but
      still rendered.
    - Removes 'scopus_api_key' from the session.

    Returns:
    - Response: Renders 'results.html' with the sampling results.
    - Redirect: Redirects to 'index' if no results are found.
    """
    sampling_id = session.get('sampling_id')

    has_results = bool(sampling_id) and sampling_id in ranked_results
    if not has_results:
        app.logger.warning(f"No sampling results found for Sampling ID: {sampling_id}")
        return redirect(url_for('index'))

    ranked_papers = ranked_results.get(sampling_id, [])
    if not ranked_papers:
        app.logger.warning(f"Ranked papers list is empty for Sampling ID: {sampling_id}")

    # The key is no longer needed once results are shown.
    session.pop('scopus_api_key', None)

    return render_template('results.html', papers=ranked_papers)
@app.route('/download_results')
def download_results():
    """
    Send the sampling results of this session as a CSV download.

    Steps:
    - Reads 'sampling_id' from the session and redirects to the index page
      if there are no results stored for it.
    - Builds the CSV in memory, one row per ranked paper.
    - Encodes it as UTF-8 with a leading byte-order mark ('utf-8-sig').
    - Returns it with a Content-Disposition header naming 'results.csv'.

    Returns:
    - Response: An HTTP response with the CSV data and headers for file download.
    - Redirect: Redirects to 'index' if no results are found.

    CSV columns:
    - 'Occurrences', 'First Author', 'Year', 'Title', 'Journal', 'Citations', 'Open Access', 'Link'.
    """
    import csv
    import io

    sampling_id = session.get('sampling_id')
    if not sampling_id or sampling_id not in ranked_results:
        return redirect(url_for('index'))

    # CSV column header -> key of the same value in each paper record
    columns = {
        'Occurrences': 'occurrences',
        'First Author': 'first_author',
        'Year': 'year',
        'Title': 'title',
        'Journal': 'journal',
        'Citations': 'citations',
        'Open Access': 'open_access',
        'Link': 'link',
    }

    # newline='' leaves the csv module's own line terminators untouched.
    buffer = io.StringIO(newline='')
    writer = csv.DictWriter(buffer, fieldnames=list(columns))
    writer.writeheader()
    for paper in ranked_results[sampling_id]:
        writer.writerow({header: paper[key] for header, key in columns.items()})

    # Encode once at the end; 'utf-8-sig' prepends the byte-order mark.
    payload = buffer.getvalue().encode('utf-8-sig')
    buffer.close()

    # Return the CSV data as an HTTP response with appropriate headers
    return Response(
        payload,
        mimetype='text/csv; charset=utf-8',
        headers={'Content-Disposition': 'attachment;filename=results.csv'}
    )
@app.route('/settings', methods=['GET', 'POST'])
def settings():
    """
    Serve the settings page.

    Methods:
    - GET: Renders the 'settings.html' template.
    - POST: Accepted, but updating settings is not implemented yet; the
      page is rendered exactly as for GET.

    Returns:
    - Response: Renders 'settings.html' template.
    """
    is_update = request.method == 'POST'
    if is_update:
        # Placeholder: settings updates are not handled yet.
        pass

    return render_template('settings.html')
if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger, which can run
    # arbitrary code on the server, so it must not be on by default.
    # Opt in for local development with FLASK_DEBUG=1.
    debug_enabled = os.getenv('FLASK_DEBUG', '').strip().lower() in {'1', 'true', 'yes', 'on'}
    app.run(debug=debug_enabled)