forked from microsoft/Mobius
-
Notifications
You must be signed in to change notification settings - Fork 0
/
FreebaseDeletionsBenchmark.cs
127 lines (103 loc) · 5.25 KB
/
FreebaseDeletionsBenchmark.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Microsoft.Spark.CSharp.Core;
namespace Microsoft.Spark.CSharp.PerfBenchmark
{
/// <summary>
/// Perf benchmark that uses Freebase deletions data
/// This data is licensed under CC-BY license (http://creativecommons.org/licenses/by/2.5)
/// Data is available for downloading : "Freebase Deleted Triples" at https://developers.google.com/freebase
/// Data format - CSV, size - 8 GB uncompressed
/// Columns in the dataset are
/// 1. creation_timestamp (Unix epoch time in milliseconds)
/// 2. creator
/// 3. deletion_timestamp (Unix epoch time in milliseconds)
/// 4. deletor
/// 5. subject (MID)
/// 6. predicate (MID)
/// 7. object (MID/Literal)
/// 8. language_code
///
/// Note: You can add an additional column with any size data, if you want to increase
/// the size for each line.
/// </summary>
class FreebaseDeletionsBenchmark
{
    /// <summary>
    /// Path prefix that is handed to Spark untouched instead of being round-tripped through <see cref="Uri"/>.
    /// </summary>
    private const string HdfsScheme = "hdfs://";

    /// <summary>
    /// Single stopwatch shared by every suite in this class; each suite restarts it before measuring.
    /// </summary>
    private static readonly Stopwatch stopwatch = new Stopwatch();

    /// <summary>
    /// Counts the lines of the input file through the RDD API, appends the elapsed time to
    /// PerfBenchmark.ExecutionTimeList and prints the count and the elapsed time.
    /// </summary>
    /// <param name="args">Benchmark arguments; the element at index 2 is read as the input file path.</param>
    [PerfSuite]
    internal static void RunRDDLineCount(string[] args)
    {
        // Path resolution happens before the stopwatch starts, so it is not part of the measurement.
        string filePath = ResolveFilePath(args[2]);

        stopwatch.Restart();
        var lines = PerfBenchmark.SparkContext.TextFile(filePath);
        var count = lines.Count();
        stopwatch.Stop();

        PerfBenchmark.ExecutionTimeList.Add(stopwatch.Elapsed);
        Console.WriteLine("Count of lines {0}. Time elapsed {1}", count, stopwatch.Elapsed);
    }

    /// <summary>
    /// Finds, through the RDD API, the user with the most deletions among rows whose creator
    /// (second column) equals the deletor (fourth column). Appends the elapsed time to
    /// PerfBenchmark.ExecutionTimeList and prints the winning user, its count and the elapsed time.
    /// </summary>
    /// <param name="args">Benchmark arguments; the element at index 2 is read as the input file path.</param>
    [PerfSuite]
    internal static void RunRDDMaxDeletionsByUser(string[] args)
    {
        // Path resolution happens before the stopwatch starts, so it is not part of the measurement.
        string filePath = ResolveFilePath(args[2]);

        stopwatch.Restart();
        var lines = PerfBenchmark.SparkContext.TextFile(filePath);
        var parsedRows = lines.Map(s => RunRDDMaxDeletionsByUser(s));
        var flaggedRows = parsedRows.Filter(s => s.Item1); //select good rows
        var selectedDeletions = flaggedRows.Filter(s => s.Item3.Equals(s.Item5)); //select deletions made by same creators
        var userDeletions = selectedDeletions.Map(s => new Tuple<string, int>(s.Item3, 1));
        var userDeletionCount = userDeletions.ReduceByKey((x, y) => x + y);
        // The zero value has a count of 0, so it never wins against a user that has at least one deletion.
        var userWithMaxDeletions = userDeletionCount.Fold(new Tuple<string, int>("zerovalue", 0), (kvp1, kvp2) => RunRDDMaxDeletionsByUser(kvp1, kvp2));
        stopwatch.Stop();

        PerfBenchmark.ExecutionTimeList.Add(stopwatch.Elapsed);
        Console.WriteLine("User with max deletions is {0}, count of deletions={1}. Elapsed time={2}", userWithMaxDeletions.Item1, userWithMaxDeletions.Item2, stopwatch.Elapsed);
    }

    /// <summary>
    /// Splits one line on commas and returns its first four fields together with a validity flag.
    /// </summary>
    /// <param name="s">One line of the input file. A null line is reported as an invalid row.</param>
    /// <returns>
    /// (true, field0, field1, field2, field3) when the line splits into more than four fields;
    /// otherwise (false, "X", "X", "X", "X").
    /// </returns>
    public static Tuple<bool, string, string, string, string> RunRDDMaxDeletionsByUser(String s)
    {
        // A null line is handled like any other malformed record instead of throwing.
        var columns = s == null ? new string[0] : s.Split(',');

        //data has some bad records - use bool flag to indicate corrupt rows
        if (columns.Length > 4)
        {
            return Tuple.Create(true, columns[0], columns[1], columns[2], columns[3]);
        }

        return Tuple.Create(false, "X", "X", "X", "X"); //invalid row placeholder
    }

    /// <summary>
    /// Picks the pair with the larger count.
    /// </summary>
    /// <param name="kvp1">First (user, count) pair.</param>
    /// <param name="kvp2">Second (user, count) pair.</param>
    /// <returns><paramref name="kvp1"/> when its count is strictly greater; otherwise <paramref name="kvp2"/> (ties included).</returns>
    public static Tuple<String, int> RunRDDMaxDeletionsByUser(Tuple<String, int> kvp1, Tuple<String, int> kvp2)
    {
        if (kvp1.Item2 > kvp2.Item2)
        {
            return kvp1;
        }

        return kvp2;
    }

    /// <summary>
    /// Counts the rows of the input file through PerfBenchmark.SqlContext, appends the elapsed time to
    /// PerfBenchmark.ExecutionTimeList and prints the count and the elapsed time.
    /// </summary>
    /// <param name="args">Benchmark arguments; the element at index 2 is passed as-is as the input file path.</param>
    [PerfSuite]
    internal static void RunDFLineCount(string[] args)
    {
        stopwatch.Restart();
        var rows = PerfBenchmark.SqlContext.TextFile(args[2]);
        var rowCount = rows.Count();
        stopwatch.Stop();

        PerfBenchmark.ExecutionTimeList.Add(stopwatch.Elapsed);
        Console.WriteLine("Count of rows {0}. Time elapsed {1}", rowCount, stopwatch.Elapsed);
    }

    /// <summary>
    /// DataFrame counterpart of the RDD max-deletions suite: keeps rows where column _c1 equals _c3,
    /// counts rows per _c1, registers the result as temp table "freebasedeletions", queries the
    /// maximum count and shows the rows that have it. The measured time includes the final Show call.
    /// </summary>
    /// <param name="args">Benchmark arguments; the element at index 2 is passed as-is as the input file path.</param>
    [PerfSuite]
    internal static void RunDFMaxDeletionsByUser(string[] args)
    {
        stopwatch.Restart();
        var rows = PerfBenchmark.SqlContext.TextFile(args[2]);
        // NOTE(review): _c1/_c3 presumably are the zero-based creator/deletor columns - confirm against SqlContext.TextFile.
        var filtered = rows.Filter("_c1 = _c3");
        var aggregated = filtered.GroupBy("_c1").Agg(new Dictionary<string, string> { { "_c1", "count" } });
        aggregated.RegisterTempTable("freebasedeletions");

        // First query finds the highest count; second query selects the rows that have that count.
        var max = PerfBenchmark.SqlContext.Sql("select max(`count(_c1)`) from freebasedeletions");
        var maxArray = max.Collect();
        var maxValue = maxArray.First();
        var maxDeletions = PerfBenchmark.SqlContext.Sql("select * from freebasedeletions where `count(_c1)` = " + maxValue.Get(0));
        maxDeletions.Show();
        //TODO - add perf suite for subquery
        stopwatch.Stop();

        PerfBenchmark.ExecutionTimeList.Add(stopwatch.Elapsed);
        Console.WriteLine("User with max deletions & count of deletions is listed above. Time elapsed {0}", stopwatch.Elapsed);
    }

    /// <summary>
    /// Returns <paramref name="path"/> unchanged when it starts with the hdfs:// scheme;
    /// otherwise returns the string form of a <see cref="Uri"/> built from it.
    /// </summary>
    /// <param name="path">Input file path taken from the benchmark arguments.</param>
    private static string ResolveFilePath(string path)
    {
        // A scheme prefix is not linguistic text, so compare ordinally rather than with the current culture.
        return path.StartsWith(HdfsScheme, StringComparison.Ordinal) ? path : new Uri(path).ToString();
    }
}
}