This repository has been archived by the owner on Jul 7, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
edits.py
136 lines (127 loc) · 4.14 KB
/
edits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import pymongo
import time
# One shared client for the whole script; called without arguments, so it
# connects using the driver's default host and port.
client = pymongo.MongoClient()

# Revision events are read from here ...
eventRevisionsCollection = client['wikimedia_history_it']['revisions']
# ... and the per-user documents updated by this script live here.
usersCollection = client['wikimedia_user_metrics']['users']

# Number of user ids processed per batch by add_edits().
PAGE_SIZE = 200000
def _fetch_user_id_page(page_index):
    """Return the ids of one page of non-bot users, ordered by id.

    Pages hold PAGE_SIZE ids each. An empty list means the requested page
    lies past the last user.
    """
    grouped = list(usersCollection.aggregate([
        {'$match': {'is_bot': False}},
        {'$sort': {'id': 1}},
        {'$skip': page_index * PAGE_SIZE},
        {'$limit': PAGE_SIZE},
        # Collapse the page into a single document carrying every id.
        {'$group': {'_id': None, 'ids': {'$push': '$id'}}},
    ]))
    return grouped[0]['ids'] if grouped else []


def _monthly_edit_counts(ids):
    """Return one row per user in *ids* that has at least one revision.

    Each row has the shape {'_id': <user id>, 'edits': {'<year>/<month>': n}}
    where n is the number of revisions by that user in that month.
    """
    return list(eventRevisionsCollection.aggregate([
        {'$match': {'event_user.id': {'$in': ids}}},
        # Count revisions per (user, year, month).
        {'$group': {
            '_id': {
                'id': '$event_user.id',
                'y': {'$year': '$event_timestamp'},
                'm': {'$month': '$event_timestamp'},
            },
            'n': {'$sum': 1},
        }},
        # Shape each count as a single [key, value] pair so that
        # $arrayToObject can turn it into {'<year>/<month>': n}.
        {'$project': {
            '_id': False,
            'id': '$_id.id',
            'v': [[
                {'$concat': [
                    {'$toString': '$_id.y'}, '/', {'$toString': '$_id.m'},
                ]},
                '$n',
            ]],
        }},
        {'$project': {'id': 1, 'v': {'$arrayToObject': '$v'}}},
        # Merge all the one-entry month objects of a user into one mapping.
        {'$group': {'_id': '$id', 'edits': {'$mergeObjects': '$v'}}},
    ]))


def add_edits(max_pages=20):
    """Store per-month edit counts on every non-bot user document.

    Users are processed in pages of PAGE_SIZE ids. For each page the
    revisions collection is aggregated into a {'<year>/<month>': count}
    mapping per user, which is written to the user's
    ``events.edit.months`` field with a bulk update.

    Args:
        max_pages: Upper bound on the number of pages processed. Iteration
            also stops as soon as a page comes back empty.
    """
    print('Starting add_edits', time.time())
    for page_index in range(max_pages):
        print('Getting all non-bot users ids', page_index, time.time())
        ids = _fetch_user_id_page(page_index)
        if not ids:
            break
        print('Getted ', len(ids), 'ids')
        print('Getted all non-bot users ids', time.time())

        print('Getting all data from revisions', time.time())
        rows = _monthly_edit_counts(ids)
        print('Getted all data from revisions', time.time())

        print('Converting all data to update queries', time.time())
        # The update documents are built here rather than in the pipeline:
        # a '$set' key cannot be emitted by $project, and MongoDB's $project
        # documentation disallows dotted names such as 'events.edit' inside
        # a nested projection spec.
        updates = [
            pymongo.UpdateOne(
                {'id': row['_id']},
                {'$set': {'events.edit': {'months': row['edits']}}},
            )
            for row in rows
        ]
        print('Getted', len(updates), 'results')
        print('Converted all data to update queries', time.time())

        print('Updating all user documents', time.time())
        # bulk_write raises InvalidOperation on an empty request list, which
        # happens when no user of this page has any revision.
        if updates:
            usersCollection.bulk_write(updates)
        print('Updated all user documents', time.time())
    print('Ending add_edits', time.time())
def reset_edits():
    """Replace ``events.edit`` on every non-bot user with an empty months map."""
    print('Starting reset_edits', time.time())
    non_bot_filter = {'is_bot': False}
    empty_months_update = {'$set': {'events.edit': {'months': {}}}}
    usersCollection.update_many(non_bot_filter, empty_months_update)
    print('End reset_edits', time.time())
# Script entry: first clear the monthly edit counters of every non-bot user,
# then recompute them from the revisions collection.
reset_edits()
add_edits()