Skip to content

Commit

Permalink
Update web100 extended views to use web100_static table (#137)
Browse files Browse the repository at this point in the history
* Update web100 extended views to use web100_static table
  • Loading branch information
stephen-soltesz authored Apr 28, 2022
1 parent 5a00291 commit 1d19ad9
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 179 deletions.
144 changes: 53 additions & 91 deletions views/ndt_intermediate/extended_web100_downloads.sql
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,43 @@

WITH PreCleanWeb100 AS (
SELECT
-- NOTE: we rename partition_date to date to avoid exposing
-- implementation details that are expected to change.
partition_date AS date,
*,
web100_log_entry.snap.Duration AS connection_duration, -- SYN to FIN total time
(web100_log_entry.snap.SndLimTimeRwin +
web100_log_entry.snap.SndLimTimeCwnd +
web100_log_entry.snap.SndLimTimeSnd) AS measurement_duration, -- Time transferring data
(blacklist_flags IS NOT NULL and blacklist_flags != 0
OR anomalies.blacklist_flags IS NOT NULL ) AS IsErrored,
(web100_log_entry.connection_spec.remote_ip IN
raw.web100.snap.Duration AS connection_duration, -- SYN to FIN total time
(raw.web100.snap.SndLimTimeRwin +
raw.web100.snap.SndLimTimeCwnd +
raw.web100.snap.SndLimTimeSnd) AS measurement_duration, -- Time transferring data
-- TODO: restore when blacklist flags (or alternate name) is restored.
-- (blacklist_flags IS NOT NULL and blacklist_flags != 0
-- OR anomalies.blacklist_flags IS NOT NULL ) AS IsErrored,
(raw.web100.connection_spec.remote_ip IN
("45.56.98.222", "35.192.37.249", "35.225.75.192", "23.228.128.99",
"2600:3c03::f03c:91ff:fe33:819", "2605:a601:f1ff:fffe::99")
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(web100_log_entry.connection_spec.local_ip),
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(raw.web100.connection_spec.local_ip),
8) = NET.IP_FROM_STRING("10.0.0.0"))
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(web100_log_entry.connection_spec.local_ip),
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(raw.web100.connection_spec.local_ip),
12) = NET.IP_FROM_STRING("172.16.0.0"))
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(web100_log_entry.connection_spec.local_ip),
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(raw.web100.connection_spec.local_ip),
16) = NET.IP_FROM_STRING("192.168.0.0"))
OR REGEXP_EXTRACT(task_filename, '(mlab[1-4])-[a-z][a-z][a-z][0-9][0-9t]') = 'mlab4'
OR REGEXP_EXTRACT(parser.ArchiveURL, '(mlab[1-4])-[a-z][a-z][a-z][0-9][0-9t]') = 'mlab4'
) AS IsOAM, -- Data is not from valid clients
web100_log_entry.snap.OctetsRetrans > 0 AS IsCongested,
( web100_log_entry.snap.SmoothedRTT > 2*web100_log_entry.snap.MinRTT AND
web100_log_entry.snap.SmoothedRTT > 1000 ) AS IsBloated,
raw.web100.snap.OctetsRetrans > 0 AS IsCongested,
( raw.web100.snap.SmoothedRTT > 2*raw.web100.snap.MinRTT AND
raw.web100.snap.SmoothedRTT > 1000 ) AS IsBloated,
STRUCT (
parser_version AS Version,
parse_time AS Time,
task_filename AS ArchiveURL,
"web100" AS Filename
parser.Version,
parser.Time,
parser.ArchiveURL,
parser.Filename
) AS Web100parser,
FROM `{{.ProjectID}}.ndt_raw.web100_legacy` -- TODO move to intermediate_ndt
FROM `{{.ProjectID}}.ndt.web100_static`
WHERE
web100_log_entry.snap.Duration IS NOT NULL
AND web100_log_entry.snap.State IS NOT NULL
AND web100_log_entry.connection_spec.local_ip IS NOT NULL
AND web100_log_entry.connection_spec.remote_ip IS NOT NULL
AND web100_log_entry.snap.SndLimTimeRwin IS NOT NULL
AND web100_log_entry.snap.SndLimTimeCwnd IS NOT NULL
AND web100_log_entry.snap.SndLimTimeSnd IS NOT NULL
raw.web100.snap.Duration IS NOT NULL
AND raw.web100.snap.State IS NOT NULL
AND raw.web100.connection_spec.local_ip IS NOT NULL
AND raw.web100.connection_spec.remote_ip IS NOT NULL
AND raw.web100.snap.SndLimTimeRwin IS NOT NULL
AND raw.web100.snap.SndLimTimeCwnd IS NOT NULL
AND raw.web100.snap.SndLimTimeSnd IS NOT NULL
),

Web100DownloadModels AS (
Expand All @@ -58,87 +56,51 @@ Web100DownloadModels AS (
-- Struct a models various TCP behaviors
STRUCT(
id as UUID,
log_time AS TestTime,
a.TestTime,
"reno" AS CongestionControl,
web100_log_entry.snap.HCThruOctetsAcked * 8.0 / measurement_duration AS MeanThroughputMbps,
web100_log_entry.snap.MinRTT * 1.0 AS MinRTT,
SAFE_DIVIDE(web100_log_entry.snap.SegsRetrans, web100_log_entry.snap.SegsOut) AS LossRate
raw.web100.snap.HCThruOctetsAcked * 8.0 / measurement_duration AS MeanThroughputMbps,
raw.web100.snap.MinRTT * 1.0 AS MinRTT,
SAFE_DIVIDE(raw.web100.snap.SegsRetrans, raw.web100.snap.SegsOut) AS LossRate
) AS a,
STRUCT (
"web100" AS _Instruments -- THIS WILL CHANGE
) AS node,
-- Struct filter has predicates for various cleaning assumptions
STRUCT (
( -- Download only, >8kB transferred, 9-60 seconds, network bottleneck
NOT IsOAM AND NOT IsErrored
AND connection_spec.data_direction IS NOT NULL
AND connection_spec.data_direction = 1
AND web100_log_entry.snap.HCThruOctetsAcked IS NOT NULL
AND web100_log_entry.snap.HCThruOctetsAcked >= 8192
NOT IsOAM -- AND NOT IsErrored
AND raw.connection.data_direction IS NOT NULL
AND raw.connection.data_direction = 1
AND raw.web100.snap.HCThruOctetsAcked IS NOT NULL
AND raw.web100.snap.HCThruOctetsAcked >= 8192
AND measurement_duration BETWEEN 9000000 AND 60000000
AND (IsCongested OR IsBloated)
) AS IsValidBest,
( -- Download only, >8kB transferred, 9-60 seconds, network bottleneck
NOT IsOAM AND NOT IsErrored
AND connection_spec.data_direction IS NOT NULL
AND connection_spec.data_direction = 1
AND web100_log_entry.snap.HCThruOctetsAcked IS NOT NULL
AND web100_log_entry.snap.HCThruOctetsAcked >= 8192
NOT IsOAM -- AND NOT IsErrored
AND raw.connection.data_direction IS NOT NULL
AND raw.connection.data_direction = 1
AND raw.web100.snap.HCThruOctetsAcked IS NOT NULL
AND raw.web100.snap.HCThruOctetsAcked >= 8192
AND measurement_duration BETWEEN 9000000 AND 60000000
AND (IsCongested) -- Does not include buffer bloat
) AS IsValid2019
) AS filter,
STRUCT (
web100_log_entry.connection_spec.remote_ip AS IP,
web100_log_entry.connection_spec.remote_port AS Port,
-- TODO(https://github.com/m-lab/etl/issues/1069): eliminate region mask once parser does this.
STRUCT(
connection_spec.ClientX.Geo.ContinentCode,
connection_spec.ClientX.Geo.CountryCode,
connection_spec.ClientX.Geo.CountryCode3,
connection_spec.ClientX.Geo.CountryName,
CAST(NULL as STRING) as Region, -- mask out region.
connection_spec.ClientX.Geo.Subdivision1ISOCode,
connection_spec.ClientX.Geo.Subdivision1Name,
connection_spec.ClientX.Geo.Subdivision2ISOCode,
connection_spec.ClientX.Geo.Subdivision2Name,
connection_spec.ClientX.Geo.MetroCode,
connection_spec.ClientX.Geo.City,
connection_spec.ClientX.Geo.AreaCode,
connection_spec.ClientX.Geo.PostalCode,
connection_spec.ClientX.Geo.Latitude,
connection_spec.ClientX.Geo.Longitude,
connection_spec.ClientX.Geo.AccuracyRadiusKm,
connection_spec.ClientX.Geo.Missing
) AS Geo,
connection_spec.ClientX.Network
-- TODO(soltesz): eliminate ip / port from server/client records.
raw.web100.connection_spec.remote_ip AS IP,
raw.web100.connection_spec.remote_port AS Port,
client.Geo,
client.Network
) AS client,
STRUCT (
web100_log_entry.connection_spec.local_ip AS IP,
web100_log_entry.connection_spec.local_port AS Port,
connection_spec.ServerX.Site,
connection_spec.ServerX.Machine,
-- TODO(https://github.com/m-lab/etl/issues/1069): eliminate region mask once parser does this.
STRUCT(
connection_spec.ServerX.Geo.ContinentCode,
connection_spec.ServerX.Geo.CountryCode,
connection_spec.ServerX.Geo.CountryCode3,
connection_spec.ServerX.Geo.CountryName,
CAST(NULL as STRING) as Region, -- mask out region.
connection_spec.ServerX.Geo.Subdivision1ISOCode,
connection_spec.ServerX.Geo.Subdivision1Name,
connection_spec.ServerX.Geo.Subdivision2ISOCode,
connection_spec.ServerX.Geo.Subdivision2Name,
connection_spec.ServerX.Geo.MetroCode,
connection_spec.ServerX.Geo.City,
connection_spec.ServerX.Geo.AreaCode,
connection_spec.ServerX.Geo.PostalCode,
connection_spec.ServerX.Geo.Latitude,
connection_spec.ServerX.Geo.Longitude,
connection_spec.ServerX.Geo.AccuracyRadiusKm,
connection_spec.ServerX.Geo.Missing
) AS Geo,
connection_spec.ServerX.Network
-- TODO(soltesz): eliminate ip / port from server/client records.
raw.web100.connection_spec.local_ip AS IP,
raw.web100.connection_spec.local_port AS Port,
server.Site,
server.Machine,
server.Geo,
server.Network
) AS server,
PreCleanWeb100 AS _internal202010 -- Not stable and subject to breaking changes
FROM PreCleanWeb100
Expand Down
138 changes: 50 additions & 88 deletions views/ndt_intermediate/extended_web100_uploads.sql
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,43 @@

WITH PreCleanWeb100 AS (
SELECT
-- NOTE: we rename partition_date to date to avoid exposing
-- implementation details that are expected to change.
partition_date AS date,
*,
web100_log_entry.snap.Duration AS connection_duration, -- SYN to FIN total time
IF(web100_log_entry.snap.Duration > 12000000, /* 12 sec */
web100_log_entry.snap.Duration - 2000000,
web100_log_entry.snap.Duration) AS measurement_duration, -- Time transferring data
(blacklist_flags IS NOT NULL and blacklist_flags != 0
OR anomalies.blacklist_flags IS NOT NULL ) AS IsErrored,
(web100_log_entry.connection_spec.remote_ip IN
raw.web100.snap.Duration AS connection_duration, -- SYN to FIN total time
IF(raw.web100.snap.Duration > 12000000, /* 12 sec */
raw.web100.snap.Duration - 2000000,
raw.web100.snap.Duration) AS measurement_duration, -- Time transferring data
-- TODO: restore when blacklist flags (or alternate name) is restored.
-- (blacklist_flags IS NOT NULL and blacklist_flags != 0
-- OR anomalies.blacklist_flags IS NOT NULL ) AS IsErrored,
(raw.web100.connection_spec.remote_ip IN
("45.56.98.222", "35.192.37.249", "35.225.75.192", "23.228.128.99",
"2600:3c03::f03c:91ff:fe33:819", "2605:a601:f1ff:fffe::99")
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(web100_log_entry.connection_spec.local_ip),
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(raw.web100.connection_spec.local_ip),
8) = NET.IP_FROM_STRING("10.0.0.0"))
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(web100_log_entry.connection_spec.local_ip),
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(raw.web100.connection_spec.local_ip),
12) = NET.IP_FROM_STRING("172.16.0.0"))
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(web100_log_entry.connection_spec.local_ip),
OR (NET.IP_TRUNC(NET.SAFE_IP_FROM_STRING(raw.web100.connection_spec.local_ip),
16) = NET.IP_FROM_STRING("192.168.0.0"))
OR REGEXP_EXTRACT(task_filename, '(mlab[1-4])-[a-z][a-z][a-z][0-9][0-9t]') = 'mlab4'
OR REGEXP_EXTRACT(parser.ArchiveURL, '(mlab[1-4])-[a-z][a-z][a-z][0-9][0-9t]') = 'mlab4'
) AS IsOAM, -- Data is not from valid clients
( -- Eliminate some clearly bogus data
web100_log_entry.snap.HCThruOctetsReceived > 1E14 -- approximately 10Gb/s for 24 hours
raw.web100.snap.HCThruOctetsReceived > 1E14 -- approximately 10Gb/s for 24 hours
) AS IsCorrupted,
STRUCT (
parser_version AS Version,
parse_time AS Time,
task_filename AS ArchiveURL,
"web100" AS Filename
parser.Version,
parser.Time,
parser.ArchiveURL,
parser.Filename
) AS Web100parser,
FROM `{{.ProjectID}}.ndt_raw.web100_legacy` -- TODO move to intermediate_ndt
FROM `{{.ProjectID}}.ndt.web100_static`
WHERE
web100_log_entry.snap.Duration IS NOT NULL
AND web100_log_entry.snap.State IS NOT NULL
AND web100_log_entry.connection_spec.local_ip IS NOT NULL
AND web100_log_entry.connection_spec.remote_ip IS NOT NULL
AND web100_log_entry.snap.SndLimTimeRwin IS NOT NULL
AND web100_log_entry.snap.SndLimTimeCwnd IS NOT NULL
AND web100_log_entry.snap.SndLimTimeSnd IS NOT NULL
raw.web100.snap.Duration IS NOT NULL
AND raw.web100.snap.State IS NOT NULL
AND raw.web100.connection_spec.local_ip IS NOT NULL
AND raw.web100.connection_spec.remote_ip IS NOT NULL
AND raw.web100.snap.SndLimTimeRwin IS NOT NULL
AND raw.web100.snap.SndLimTimeCwnd IS NOT NULL
AND raw.web100.snap.SndLimTimeSnd IS NOT NULL
),

Web100UploadModels AS (
Expand All @@ -58,10 +56,10 @@ Web100UploadModels AS (
-- Struct a models various TCP behaviors
STRUCT(
id as UUID,
log_time AS TestTime,
a.TestTime,
'' AS CongestionControl, -- https://github.com/m-lab/etl-schema/issues/95
web100_log_entry.snap.HCThruOctetsReceived * 8.0 / connection_duration AS MeanThroughputMbps,
web100_log_entry.snap.MinRTT * 1.0 AS MinRTT, -- Note: download side measurement (ms)
raw.web100.snap.HCThruOctetsReceived * 8.0 / connection_duration AS MeanThroughputMbps,
raw.web100.snap.MinRTT * 1.0 AS MinRTT, -- Note: download side measurement (ms)
Null AS LossRate -- Receiver can not measure loss
) AS a,
STRUCT (
Expand All @@ -70,73 +68,37 @@ Web100UploadModels AS (
-- Struct filter has predicates for various cleaning assumptions
STRUCT (
( -- Upload only, >8kB transferred, 9-60 seconds
NOT IsOAM AND NOT IsErrored AND NOT IsCorrupted
AND connection_spec.data_direction IS NOT NULL
AND connection_spec.data_direction = 0
AND web100_log_entry.snap.HCThruOctetsReceived IS NOT NULL
AND web100_log_entry.snap.HCThruOctetsReceived >= 8192
-- IsCorrupted is still computed in PreCleanWeb100, so keep filtering on it here;
-- only IsErrored is disabled until blacklist flags are restored.
NOT IsOAM AND NOT IsCorrupted -- AND NOT IsErrored
AND raw.connection.data_direction IS NOT NULL
AND raw.connection.data_direction = 0
AND raw.web100.snap.HCThruOctetsReceived IS NOT NULL
AND raw.web100.snap.HCThruOctetsReceived >= 8192
AND connection_duration BETWEEN 9000000 AND 60000000
) AS IsValidBest,
( -- Upload only, >8kB transferred, 9-60 seconds
NOT IsOAM AND NOT IsErrored
AND connection_spec.data_direction IS NOT NULL
AND connection_spec.data_direction = 0
AND web100_log_entry.snap.HCThruOctetsReceived IS NOT NULL
AND web100_log_entry.snap.HCThruOctetsReceived >= 8192
NOT IsOAM -- AND NOT IsErrored
AND raw.connection.data_direction IS NOT NULL
AND raw.connection.data_direction = 0
AND raw.web100.snap.HCThruOctetsReceived IS NOT NULL
AND raw.web100.snap.HCThruOctetsReceived >= 8192
AND connection_duration BETWEEN 9000000 AND 60000000
) AS IsValid2019
) AS filter,
STRUCT (
web100_log_entry.connection_spec.remote_ip AS IP,
web100_log_entry.connection_spec.remote_port AS Port,
-- TODO(https://github.com/m-lab/etl/issues/1069): eliminate region mask once parser does this.
STRUCT(
connection_spec.ClientX.Geo.ContinentCode,
connection_spec.ClientX.Geo.CountryCode,
connection_spec.ClientX.Geo.CountryCode3,
connection_spec.ClientX.Geo.CountryName,
CAST(NULL as STRING) as Region, -- mask out region.
connection_spec.ClientX.Geo.Subdivision1ISOCode,
connection_spec.ClientX.Geo.Subdivision1Name,
connection_spec.ClientX.Geo.Subdivision2ISOCode,
connection_spec.ClientX.Geo.Subdivision2Name,
connection_spec.ClientX.Geo.MetroCode,
connection_spec.ClientX.Geo.City,
connection_spec.ClientX.Geo.AreaCode,
connection_spec.ClientX.Geo.PostalCode,
connection_spec.ClientX.Geo.Latitude,
connection_spec.ClientX.Geo.Longitude,
connection_spec.ClientX.Geo.AccuracyRadiusKm,
connection_spec.ClientX.Geo.Missing
) AS Geo,
connection_spec.ClientX.Network
-- TODO(soltesz): eliminate ip / port from server/client records.
raw.web100.connection_spec.remote_ip AS IP,
raw.web100.connection_spec.remote_port AS Port,
client.Geo,
client.Network
) AS client,
STRUCT (
web100_log_entry.connection_spec.local_ip AS IP,
web100_log_entry.connection_spec.local_port AS Port,
connection_spec.ServerX.Site,
connection_spec.ServerX.Machine,
-- TODO(https://github.com/m-lab/etl/issues/1069): eliminate region mask once parser does this.
STRUCT(
connection_spec.ServerX.Geo.ContinentCode,
connection_spec.ServerX.Geo.CountryCode,
connection_spec.ServerX.Geo.CountryCode3,
connection_spec.ServerX.Geo.CountryName,
CAST(NULL as STRING) as Region, -- mask out region.
connection_spec.ServerX.Geo.Subdivision1ISOCode,
connection_spec.ServerX.Geo.Subdivision1Name,
connection_spec.ServerX.Geo.Subdivision2ISOCode,
connection_spec.ServerX.Geo.Subdivision2Name,
connection_spec.ServerX.Geo.MetroCode,
connection_spec.ServerX.Geo.City,
connection_spec.ServerX.Geo.AreaCode,
connection_spec.ServerX.Geo.PostalCode,
connection_spec.ServerX.Geo.Latitude,
connection_spec.ServerX.Geo.Longitude,
connection_spec.ServerX.Geo.AccuracyRadiusKm,
connection_spec.ServerX.Geo.Missing
) AS Geo,
connection_spec.ServerX.Network
-- TODO(soltesz): eliminate ip / port from server/client records.
raw.web100.connection_spec.local_ip AS IP,
raw.web100.connection_spec.local_port AS Port,
server.Site,
server.Machine,
server.Geo,
server.Network
) AS server,
PreCleanWeb100 AS _internal202010 -- Not stable and subject to breaking changes
FROM PreCleanWeb100
Expand Down

0 comments on commit 1d19ad9

Please sign in to comment.