
Commit

adding source into dumpjson
aysim319 committed Oct 15, 2024
1 parent c992ad2 commit 7e1f53e
Showing 1 changed file with 16 additions and 13 deletions.
29 changes: 16 additions & 13 deletions archive_differ_vs_api/compare.py
@@ -70,8 +70,8 @@
 
 CSV_PATH = Path(__file__).parent / 'diff_csv_joined'
 
-def dump_json(data):
-    f = open(f'{S3_SOURCE}.json', 'a')
+def dump_json(data, source):
+    f = open(f'{source}.json', 'a')
     json.dump(data, f)
     f.write(",\n")
     f.close()
@@ -88,7 +88,7 @@ def parse_bucket_info(obj) -> Dict:
     csvname_s3 = prefixes[1]
     if len(csvname_s3) == 0:
         row = {"file_name": obj.key, "source": source_api, "skip": True, "reason": "file has no name"}
-        dump_json(row)
+        dump_json(row, source_api)
         return dict()
 
     # time_value
@@ -100,7 +100,7 @@ def parse_bucket_info(obj) -> Dict:
     if time_value_s3[:2] != "20":
         # print("file has non-standardized naming")
         row = {"file_name": obj.key, "source": source_api, "skip": True, "reason": "file has non-standardized naming"}
-        dump_json(row)
+        dump_json(row, source_api)
         return dict()
 
     # geo
@@ -118,7 +118,7 @@ def parse_bucket_info(obj) -> Dict:
     # remove work in progress
     if 'wip' in signal_s3:
         row = {"file_name": obj.key, "source": source_api, "skip": True, "reason": f"wip in signal name"}
-        dump_json(row)
+        dump_json(row, source_api)
         return dict()
     else:
         signal_api = signal_s3
@@ -180,7 +180,7 @@ def check_diff_with_merge(df_s3, df_api):
     else:
         # print(f"Unsuccessful S3 get_object response. Status - {status}")
         row = {"file_name":obj.key, "source":source_api, "skip":True, "reason": f"Unsuccessful S3 get_object response. Status - {status}"}
-        dump_json(row)
+        dump_json(row, source_api)
         continue


@@ -219,25 +219,28 @@ def check_diff_with_merge(df_s3, df_api):
             "api_row_count": num_df_latest,
             "skip":False
         }
-        dump_json(row)
+        dump_json(row, source_api)
     else:
         csv_file_split = str(obj.key).split("/")
         Path(f'{CSV_PATH}/{csv_file_split[0]}').mkdir(parents=True, exist_ok=True)
         diff.to_csv(f'{CSV_PATH}/{str(obj.key)}', index=False)
         diff_w_merge = check_diff_with_merge(df_s3=df_s3, df_api=df_latest)
         if not diff_w_merge.empty:
-            diff_w_merge.to_csv(f'{CSV_PATH}/{csv_file_split[0]}/joined_{csv_file_split[1]}', index=False)
+            diff_w_merge.to_csv(f'{CSV_PATH}/{csv_file_split[0]}/joined_{csv_file_split[1]}', index=False)
+            diff = {
+                "num_rows":number_of_dif,
+                "s3_nan_row_count": int(diff_w_merge["val_s3"].isna().sum()),
+                "api_nan_row_count": int(diff_w_merge["val_api"].isna().sum()),
+            }
         row = {
             "file_name":obj.key,
             "source":source_api,
             "signal":signal_api,
             "time_value":time_value_s3,
             "geo_type":geo_s3,
             "dif_row_count":number_of_dif,
             "s3_row_count": num_df_s3,
             "api_row_count": num_df_latest,
-            "full_dif":full_file_dif_potential,
+            "full_diff":full_file_dif_potential,
+            "diff": diff,
             "skip":False,
         }
-        dump_json(row)
+        dump_json(row, source_api)
         full_file_dif_potential = False
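
For readers skimming the diff: the change threads a source argument through dump_json so that skip/diff records are appended to a per-source JSON file rather than the single {S3_SOURCE}.json used before. A minimal sketch of the resulting behavior follows; the with-statement and the sample row values are illustrative additions, not code from the commit:

    import json

    def dump_json(data, source):
        # Append one JSON object per call to a per-source file,
        # e.g. source "chng" appends to chng.json.
        with open(f'{source}.json', 'a') as f:
            json.dump(data, f)
            f.write(",\n")

    # Hypothetical call mirroring the skip-record pattern in compare.py:
    row = {"file_name": "chng/20200101_county_smoothed.csv",
           "source": "chng", "skip": True, "reason": "file has no name"}
    dump_json(row, row["source"])

Note that writing ",\n" after each object produces a comma-separated stream of JSON objects rather than a single valid JSON document; a consumer would need to wrap the file contents in brackets and drop the final comma before parsing.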
