diff --git a/docker/exporter/exporter.py b/docker/exporter/exporter.py
index a018a5b806b..72ee0f2c0cd 100755
--- a/docker/exporter/exporter.py
+++ b/docker/exporter/exporter.py
@@ -102,6 +102,7 @@ def _export_ecosystem_to_bucket(self, ecosystem: str, tmp_dir: str):
 
     zip_path = os.path.join(tmp_dir, 'all.zip')
     with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+      files_to_zip = []
 
       @ndb.tasklet
       def _export_to_file_and_zipfile(bug):
@@ -112,15 +113,18 @@ def _export_to_file_and_zipfile(bug):
 
         file_path = os.path.join(tmp_dir, bug.id() + '.json')
         vulnerability = yield bug.to_vulnerability_async(include_source=True)
         osv.write_vulnerability(vulnerability, file_path)
-        # Tasklets are not truly multiple threads;they are actually
-        # event loops, which makes it safe to write to ZIP files."
-        # Details: https://cloud.google.com/appengine/docs/legacy/
-        # standard/python/ndb/async#tasklets
-        zip_file.write(file_path, os.path.basename(file_path))
+        files_to_zip.append(file_path)
+
+      # This *should* pause here until
+      # all the exports have been written to disk.
       osv.Bug.query(
           osv.Bug.ecosystem == ecosystem).map(_export_to_file_and_zipfile)
 
+      files_to_zip.sort()
+      for file_path in files_to_zip:
+        zip_file.write(file_path, os.path.basename(file_path))
+
     with concurrent.futures.ThreadPoolExecutor(
         max_workers=_EXPORT_WORKERS) as executor:
      # Note: all.zip is included here
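
For reviewers: a minimal, self-contained sketch of the collect-then-zip pattern this diff adopts. It uses a ThreadPoolExecutor as a stand-in for ndb tasklets, and the function name `export_all` plus the `(bug_id, json_text)` record shape are hypothetical, not part of the exporter. The point is the same: no worker touches the shared ZipFile; paths are collected, sorted for a deterministic archive, and written serially once everything is on disk.

```python
import concurrent.futures
import os
import zipfile


def export_all(records, tmp_dir):
  """Write each record to its own JSON file concurrently, then zip serially.

  `records` is an iterable of (bug_id, json_text) pairs, a stand-in for
  the osv.Bug query results in the real exporter.
  """

  def _write_one(record):
    bug_id, json_text = record
    file_path = os.path.join(tmp_dir, bug_id + '.json')
    with open(file_path, 'w') as f:
      f.write(json_text)
    return file_path

  with concurrent.futures.ThreadPoolExecutor() as executor:
    # Materializing the map() iterator waits for every worker, so all
    # per-record files exist on disk before the zip step below starts.
    files_to_zip = list(executor.map(_write_one, records))

  # Sort so all.zip has a stable member order across runs, then write the
  # archive from a single thread (zipfile.ZipFile is not safe to share
  # across concurrent writers).
  files_to_zip.sort()
  zip_path = os.path.join(tmp_dir, 'all.zip')
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
    for file_path in files_to_zip:
      zip_file.write(file_path, os.path.basename(file_path))
  return zip_path
```

Deferring the archive writes also removes the reliance on the (deleted) assumption that tasklet scheduling makes concurrent `zip_file.write()` calls safe.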