-
Notifications
You must be signed in to change notification settings - Fork 0
/
91_Find_Fail_Gen_Pig_Script.py
34 lines (24 loc) · 4.41 KB
/
91_Find_Fail_Gen_Pig_Script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import os.path
# Generate Pig scripts for selected folds of one topic.
#
# For every (fold, window) pair one ``.pig`` file is written.  It contains one
# LOAD / GROUP / FOREACH / STORE section per hour index in the window, but only
# for hours whose per-hour CSV file exists on disk.
#
# Paths noted by the original author (kept for reference):
#   E:\tweet_process\result_follower-ret\04_each_hour\aroii\fold_1
#   E:\tweet_process\result_follower-ret\05_pig_script\aroii
#
# NOTE(review): the nesting below (write only when the CSV exists, one output
# file per window) was reconstructed from a copy of the script whose
# indentation had been lost -- confirm against the original file.

# First hour index of each window; a window covers
# range(start, start + end_hour).
# Alternative value kept from the original: [600, 700, 800]
start_hour = [300, 400, 500]
# Window length in hours (added to each start; not an absolute end hour).
end_hour = 100

# Folds to generate scripts for.
# Alternative value kept from the original: [3, 5]
fold_fail = [2]

# Topic name, used as directory name and file-name prefix.
# Alternative values kept from the original: "aroii", "hormonestheseries", "apple"
topic = "thefacethailand"

# Directory roots.  Forward slashes are used on purpose so that the generated
# and printed path strings stay exactly as the original script produced them.
HOURLY_CSV_ROOT = "E:/tweet_process/result_follower-ret/04_each_hour"
PIG_SCRIPT_ROOT = "E:/tweet_process/result_follower-ret/091_specific_pig_script"

# Column layout declared in the Pig LOAD statement: (base name, Pig type).
# Every column name is suffixed with "_<hour>" in the generated script.
CSV_SCHEMA = (
    ("index_hour", "int"),
    ("message_count", "double"),
    ("index_line", "int"),
    ("is_retweet", "int"),
    ("is_quote", "int"),
    ("original_tweet_date", "chararray"),
    ("original_tweet_time", "chararray"),
    ("original_tweet_user_follower_count", "double"),
    ("original_tweet_retweet_count", "double"),
    ("original_tweet_id", "chararray"),
    ("original_tweet_user_id", "chararray"),
    ("retweet_tweet_date", "chararray"),
    ("retweet_tweet_time", "chararray"),
    ("retweet_tweet_id", "chararray"),
    ("retweet_tweet_user_id", "chararray"),
    ("original_tweet_epoch", "int"),
    ("retweet_tweet_epoch", "int"),
)

# Pig Latin emitted for a single hour.  ``{i}`` is the hour index; literal
# braces of the nested FOREACH block are doubled because the text goes
# through ``str.format``.
PIG_TEMPLATE = (
    "csv_file_{i} = LOAD '{topic}_fold_{fold}/t{i}.csv' USING PigStorage(',') AS ({schema});\n"
    "grouped_by_original_tweet_id_{i} = GROUP csv_file_{i} BY original_tweet_id_{i};\n"
    "cal_diff_of_each_tweet_{i} = FOREACH grouped_by_original_tweet_id_{i} {{\n"
    "sort_end_{i} = ORDER csv_file_{i} BY retweet_tweet_epoch_{i} DESC;\n"
    "retweet_at_end_{i} = LIMIT sort_end_{i} 1;\n"
    "sort_start_{i} = ORDER csv_file_{i} BY retweet_tweet_epoch_{i} ASC;\n"
    "retweet_at_start_{i} = LIMIT sort_start_{i} 1;\n"
    "start_retweet_{i} = SUM(retweet_at_start_{i}.original_tweet_retweet_count_{i});\n"
    "end_retweet_{i} = SUM(retweet_at_end_{i}.original_tweet_retweet_count_{i});\n"
    "diff_retweet_{i} = end_retweet_{i}-start_retweet_{i};\n"
    "start_follower_{i} = SUM(retweet_at_start_{i}.original_tweet_user_follower_count_{i});\n"
    "end_follower_{i} = SUM(retweet_at_end_{i}.original_tweet_user_follower_count_{i});\n"
    "start_message_count_{i} = SUM(retweet_at_start_{i}.message_count_{i});\n"
    "end_message_count_{i} = SUM(retweet_at_end_{i}.message_count_{i});\n"
    "end_part_{i} = end_follower_{i} / end_message_count_{i};\n"
    "start_part_{i} = start_follower_{i} / start_message_count_{i};\n"
    "diff_follower_{i} = end_part_{i} - start_part_{i};\n"
    "owner_id_{i} = retweet_at_start_{i}.original_tweet_user_id_{i};\n"
    "hour_{i} = retweet_at_start_{i}.index_hour_{i};\n"
    "diff_follower_original_{i} = end_follower_{i} - start_follower_{i};\n"
    "GENERATE group, start_retweet_{i}, end_retweet_{i}, diff_retweet_{i}, "
    "end_follower_{i}, end_message_count_{i}, start_follower_{i}, "
    "start_message_count_{i}, end_part_{i}, start_part_{i}, diff_follower_{i}, "
    "FLATTEN(owner_id_{i}), FLATTEN(hour_{i}), diff_follower_original_{i};\n"
    "}};\n"
    "STORE cal_diff_of_each_tweet_{i} INTO 'result_t{i}' USING PigStorage(',');\n"
)


def build_pig_code(topic_name, fold, hour):
    """Return the Pig Latin section for one hourly CSV file.

    Args:
        topic_name: Topic used in the LOAD path (``<topic>_fold_<fold>/...``).
        fold: Fold number used in the LOAD path.
        hour: Hour index; appended to every alias and column name and used
            in the input (``t<hour>.csv``) and output (``result_t<hour>``)
            names.

    Returns:
        The script text as a string, terminated by a newline.
    """
    schema = ", ".join(
        "{0}_{1}:{2}".format(column, hour, pig_type)
        for column, pig_type in CSV_SCHEMA
    )
    return PIG_TEMPLATE.format(i=hour, topic=topic_name, fold=fold, schema=schema)


def generate_scripts(topic_name, folds, window_starts, window_length,
                     csv_root=HOURLY_CSV_ROOT, script_root=PIG_SCRIPT_ROOT):
    """Write one Pig script per (fold, window) and return the paths written.

    For each fold and each start in ``window_starts`` a file
    ``<script_root>/<topic>/script_<topic>_fold_<fold>_<start>_<end>.pig`` is
    created (truncating any existing file), where ``end`` is
    ``start + window_length``.  A section is appended for every hour in
    ``range(start, end)`` whose CSV
    ``<csv_root>/<topic>/fold_<fold>/t<hour>.csv`` exists; the path of each
    CSV found is printed.  A window without any existing CSV yields an empty
    script file.

    Args:
        topic_name: Topic directory name / file-name prefix.
        folds: Iterable of fold numbers.
        window_starts: Iterable of first hour indices.
        window_length: Number of hour indices per window.
        csv_root: Directory holding the per-topic hourly CSV folders.
        script_root: Directory holding the per-topic script folders.

    Returns:
        List of script paths, in the order they were written.

    Raises:
        IOError/OSError: If a script file cannot be opened for writing, e.g.
            because ``<script_root>/<topic>`` does not exist.
    """
    written = []
    for fold in folds:
        for first_hour in window_starts:
            last_hour = first_hour + window_length
            script_path = "{0}/{1}/script_{1}_fold_{2}_{3}_{4}.pig".format(
                script_root, topic_name, fold, first_hour, last_hour
            )
            # The context manager closes the file even if a write fails.
            with open(script_path, "w") as script:
                for hour in range(first_hour, last_hour):
                    csv_path = "{0}/{1}/fold_{2}/t{3}.csv".format(
                        csv_root, topic_name, fold, hour
                    )
                    if os.path.isfile(csv_path):
                        print(csv_path)
                        script.write(build_pig_code(topic_name, fold, hour))
            written.append(script_path)
    return written


if __name__ == "__main__":
    generate_scripts(topic, fold_fail, start_hour, end_hour)