Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use name normalization method #1132

Merged
merged 8 commits into from
Dec 13, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 59 additions & 7 deletions share/patch/patch_db_zonemaster_backend_ver_11.0.3.pl
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,25 @@
);

my $db_engine = $config->DB_engine;
print "engine: $db_engine\n";
print "Configured database engine: $db_engine\n";

if ( $db_engine =~ /^(MySQL|PostgreSQL|SQLite)$/ ) {
print( "Starting database migration\n" );
$patch{ lc $db_engine }();
print( "\nMigration done\n" );
}
else {
die "Unknown database engine configured: $db_engine\n";
}

sub _update_data {
sub _update_data_result_entries {
my ( $dbh ) = @_;

my $json = JSON::PP->new->allow_blessed->convert_blessed->canonical;

# update only jobs with results
my ( $row_total ) = $dbh->selectrow_array( 'SELECT count(*) FROM test_results WHERE results IS NOT NULL' );
print "count: $row_total\n";
print "Will update $row_total rows\n";

my %levels = Zonemaster::Engine::Logger::Entry->levels();

Expand All @@ -40,7 +42,7 @@ sub _update_data {
my $row_count = 50000;
my $row_done = 0;
while ( $row_done < $row_total ) {
print "row_done/row_total: $row_done / $row_total\n";
print "Progress update: $row_done / $row_total\n";
my $row_updated = 0;
my $sth1 = $dbh->prepare( 'SELECT hash_id, results FROM test_results WHERE results IS NOT NULL ORDER BY id ASC LIMIT ?,?' );
$sth1->execute( $row_done, $row_count );
Expand Down Expand Up @@ -77,7 +79,43 @@ sub _update_data {
# increase by min(row_updated, row_count)
$row_done += ( $row_updated < $row_count ) ? $row_updated : $row_count;
}
print "row_done/row_total: $row_done / $row_total\n";
print "Progress update: $row_done / $row_total\n";
}

sub _update_data_nomalize_domains {
my ( $db ) = @_;

my ( $row_total ) = $db->dbh->selectrow_array( 'SELECT count(*) FROM test_results' );
print "Will update $row_total rows\n";


my $sth1 = $db->dbh->prepare( 'SELECT hash_id, params FROM test_results' );
$sth1->execute;

my $row_done = 0;
my $progress = 0;

while ( my $row = $sth1->fetchrow_hashref ) {
my $hash_id = $row->{hash_id};
my $raw_params = decode_json($row->{params});
my $domain = $raw_params->{domain};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're extracting domain from params and putting in its own field, but we're still keeping a copy inside params. Is this intentional? Should this be further cleaned up in the future?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The domain is still in the params, whether it could / should be removed is a good question, but outside of this PR.

Here I am taking the domain from the params instead its own field for normalization to avoid encoding issues.


# This has never been cleaned
delete $raw_params->{user_ip};

my $params = $db->encode_params( $raw_params );
my $fingerprint = $db->generate_fingerprint( $raw_params );

$domain = Zonemaster::Backend::DB::_normalize_domain( $domain );

$db->dbh->do('UPDATE test_results SET domain = ?, params = ?, fingerprint = ? where hash_id = ?', undef, $domain, $params, $fingerprint, $hash_id);
$row_done += 1;
my $new_progress = int(($row_done / $row_total) * 100);
if ( $new_progress != $progress ) {
$progress = $new_progress;
print("$progress%\n");
}
}
}

sub patch_db_mysql {
Expand All @@ -91,7 +129,11 @@ sub patch_db_mysql {
try {
$db->create_schema();

_update_data( $dbh );
print( "\n-> (1/2) Populating new result_entries table\n" );
_update_data_result_entries( $dbh );

print( "\n-> (2/2) Normalizing domain names\n" );
_update_data_nomalize_domains( $db );

$dbh->commit();
} catch {
Expand All @@ -112,6 +154,8 @@ sub patch_db_postgresql {
try {
$db->create_schema();

print( "\n-> (1/2) Populating new result_entries table\n" );

$dbh->do(q[
INSERT INTO result_entries (
hash_id, args, module, level, tag, timestamp, testcase
Expand Down Expand Up @@ -139,6 +183,10 @@ sub patch_db_postgresql {
'UPDATE test_results SET results = NULL WHERE results IS NOT NULL'
);


print( "\n-> (2/2) Normalizing domain names\n" );
_update_data_nomalize_domains( $db );

$dbh->commit();
} catch {
print( "Could not upgrade database: " . $_ );
Expand All @@ -158,7 +206,11 @@ sub patch_db_sqlite {
try {
$db->create_schema();

_update_data( $dbh );
print( "\n-> (1/2) Populating new result_entries table\n" );
_update_data_result_entries( $dbh );

print( "\n-> (2/2) Normalizing domain names\n" );
_update_data_nomalize_domains( $db );

$dbh->commit();
} catch {
Expand Down