Skip to content

Commit

Permalink
Move tenant & timeline dir method to NeonPageserver and use them ever…
Browse files Browse the repository at this point in the history
…ywhere (#5262)

## Problem
In many places in the test code, paths are built manually where
NeonEnv.tenant_dir and NeonEnv.timeline_dir could be used instead.

## Summary of changes
1. NeonEnv.tenant_dir and NeonEnv.timeline_dir were moved into the class
NeonPageserver, as the path they build is per-pageserver-instance.
2. These methods are now used everywhere, replacing manual path building.

Closes #5258

---------

Signed-off-by: Rahul Modpur <[email protected]>
  • Loading branch information
rmodpur authored Sep 15, 2023
1 parent e400a38 commit e6985bd
Show file tree
Hide file tree
Showing 18 changed files with 68 additions and 94 deletions.
27 changes: 15 additions & 12 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -847,18 +847,6 @@ def get_safekeeper_connstrs(self) -> str:
"""Get list of safekeeper endpoints suitable for safekeepers GUC"""
return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)

def timeline_dir(
    self, tenant_id: TenantId, timeline_id: TimelineId, pageserver_id: Optional[int] = None
) -> Path:
    """Return the directory of one timeline, rooted at the test repo's tenant dir.

    The path is resolved through ``tenant_dir`` so the optional
    ``pageserver_id`` selects which pageserver's workdir is used.
    """
    tenant_path = self.tenant_dir(tenant_id, pageserver_id=pageserver_id)
    return tenant_path / "timelines" / str(timeline_id)

def tenant_dir(self, tenant_id: TenantId, pageserver_id: Optional[int] = None) -> Path:
    """Return the directory of one tenant under the selected pageserver's workdir."""
    pageserver = self.get_pageserver(pageserver_id)
    return pageserver.workdir / "tenants" / str(tenant_id)

def get_pageserver_version(self) -> str:
bin_pageserver = str(self.neon_binpath / "pageserver")
res = subprocess.run(
Expand Down Expand Up @@ -1580,6 +1568,21 @@ def __init__(
'.*registered custom resource manager "neon".*',
]

def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
    """Return a tenant's "timelines" root, or one timeline's directory within it.

    Without ``timeline_id`` the tenant's "timelines" directory itself is
    returned; with it, the directory of that particular timeline.
    """
    timelines_root = self.tenant_dir(tenant_id) / "timelines"
    if timeline_id is not None:
        timelines_root = timelines_root / str(timeline_id)
    return timelines_root

def tenant_dir(
    self,
    tenant_id: Optional[TenantId] = None,
) -> Path:
    """Return the pageserver's "tenants" root, or one tenant's directory within it.

    Without ``tenant_id`` the "tenants" directory under this pageserver's
    workdir is returned; with it, the directory of that specific tenant.
    """
    tenants_root = self.workdir / "tenants"
    return tenants_root if tenant_id is None else tenants_root / str(tenant_id)

def start(
self,
overrides: Tuple[str, ...] = (),
Expand Down
2 changes: 1 addition & 1 deletion test_runner/performance/test_bulk_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def measure_recovery_time(env: NeonCompare):

# Stop pageserver and remove tenant data
env.env.pageserver.stop()
timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
timeline_dir = env.env.pageserver.timeline_dir(env.tenant, env.timeline)
shutil.rmtree(timeline_dir)

# Start pageserver
Expand Down
4 changes: 2 additions & 2 deletions test_runner/regress/test_broken_timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)

tenant_id = env.initial_tenant

timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
timelines_dir = env.pageserver.timeline_dir(tenant_id)
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

Expand Down Expand Up @@ -166,7 +166,7 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde

tenant_id = env.initial_tenant

timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
timelines_dir = env.pageserver.timeline_dir(tenant_id)
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

Expand Down
2 changes: 1 addition & 1 deletion test_runner/regress/test_disk_usage_eviction.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def poor_mans_du(
largest_layer = 0
smallest_layer = None
for tenant_id, timeline_id in timelines:
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
total = 0
for file in timeline_dir.iterdir():
Expand Down
2 changes: 1 addition & 1 deletion test_runner/regress/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def _import(
env.endpoints.stop_all()
env.pageserver.stop()

dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)

Expand Down
6 changes: 3 additions & 3 deletions test_runner/regress/test_layer_eviction.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_basic_eviction(
for sk in env.safekeepers:
sk.stop()

timeline_path = env.timeline_dir(tenant_id, timeline_id)
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
Expand Down Expand Up @@ -243,15 +243,15 @@ def tenant_update_config(changes):
assert by_kind["Image"] > 0
assert by_kind["Delta"] > 0
assert by_kind["InMemory"] == 0
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
log.info("resident layers count before eviction: %s", len(resident_layers))

log.info("evict all layers")
ps_http.evict_all_layers(tenant_id, timeline_id)

def ensure_resident_and_remote_size_metrics():
log.info("ensure that all the layers are gone")
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
# we have disabled all background loops, so, this should hold
assert len(resident_layers) == 0

Expand Down
4 changes: 2 additions & 2 deletions test_runner/regress/test_layer_writers_fail.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
new_temp_layer_files = list(
filter(
lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
[path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
[path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()],
)
)

Expand Down Expand Up @@ -84,7 +84,7 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
new_temp_layer_files = list(
filter(
lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
[path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
[path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()],
)
)

Expand Down
7 changes: 3 additions & 4 deletions test_runner/regress/test_ondemand_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import time
from collections import defaultdict
from pathlib import Path
from typing import Any, DefaultDict, Dict, Tuple

import pytest
Expand Down Expand Up @@ -115,7 +114,7 @@ def test_ondemand_download_large_rel(
env.pageserver.stop()

# remove all the layer files
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
layer.unlink()

Expand Down Expand Up @@ -237,7 +236,7 @@ def get_resident_physical_size():
env.pageserver.stop()

# remove all the layer files
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
layer.unlink()

Expand Down Expand Up @@ -368,7 +367,7 @@ def get_resident_physical_size():

# remove all the layer files
# XXX only delete some of the layer files, to show that it really just downloads all the layers
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer.name}")
layer.unlink()

Expand Down
13 changes: 7 additions & 6 deletions test_runner/regress/test_remote_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import shutil
import threading
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pytest
Expand Down Expand Up @@ -137,7 +136,7 @@ def test_remote_storage_backup_and_restore(
env.endpoints.stop_all()
env.pageserver.stop()

dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)

Expand Down Expand Up @@ -353,7 +352,7 @@ def churn_while_failpoints_active(result):
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()

dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)

Expand Down Expand Up @@ -488,7 +487,7 @@ def churn(data_pass1, data_pass2):
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()

dir_to_clear = Path(env.pageserver.workdir) / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)

Expand Down Expand Up @@ -533,7 +532,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

timeline_path = env.timeline_dir(tenant_id, timeline_id)
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)

client = env.pageserver.http_client()

Expand Down Expand Up @@ -704,7 +703,9 @@ def test_empty_branch_remote_storage_upload_on_restart(
# index upload is now hitting the failpoint, it should block the shutdown
env.pageserver.stop(immediate=True)

local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
local_metadata = (
env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
)
assert local_metadata.is_file()

assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
Expand Down
2 changes: 1 addition & 1 deletion test_runner/regress/test_tenant_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):

# tenant is created with defaults, as in without config file
(tenant_id, timeline_id) = env.neon_cli.create_tenant()
config_path = env.pageserver.workdir / "tenants" / str(tenant_id) / "config"
config_path = env.pageserver.tenant_dir(tenant_id) / "config"
assert config_path.exists(), "config file is always initially created"

http_client = env.pageserver.http_client()
Expand Down
8 changes: 4 additions & 4 deletions test_runner/regress/test_tenant_delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_tenant_delete_smoke(

tenant_delete_wait_completed(ps_http, tenant_id, iterations)

tenant_path = env.tenant_dir(tenant_id=tenant_id)
tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()

if remote_storage_kind in available_s3_storages():
Expand Down Expand Up @@ -269,7 +269,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(

tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)

tenant_dir = env.tenant_dir(tenant_id)
tenant_dir = env.pageserver.tenant_dir(tenant_id)
# Check local is empty
assert not tenant_dir.exists()

Expand Down Expand Up @@ -366,7 +366,7 @@ def test_tenant_delete_is_resumed_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()

dir_to_clear = env.pageserver.workdir / "tenants"
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)

Expand All @@ -379,7 +379,7 @@ def test_tenant_delete_is_resumed_on_attach(
wait_tenant_status_404(ps_http, tenant_id, iterations)

    # we shouldn't have created a tenant dir on disk
tenant_path = env.tenant_dir(tenant_id=tenant_id)
tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()

if remote_storage_kind in available_s3_storages():
Expand Down
18 changes: 9 additions & 9 deletions test_runner/regress/test_tenant_detach.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
)

# assert tenant exists on disk
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert env.pageserver.tenant_dir(tenant_id).exists()

endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
Expand Down Expand Up @@ -329,7 +329,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
log.info("gc thread returned")

# check that nothing is left on disk for deleted tenant
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert not env.pageserver.tenant_dir(tenant_id).exists()

with pytest.raises(
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
Expand All @@ -354,7 +354,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
)

# assert tenant exists on disk
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert env.pageserver.tenant_dir(tenant_id).exists()

endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
Expand Down Expand Up @@ -383,7 +383,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
log.info("ignored tenant detached without error")

# check that nothing is left on disk for deleted tenant
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert not env.pageserver.tenant_dir(tenant_id).exists()

# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
Expand All @@ -410,7 +410,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
)

# assert tenant exists on disk
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert env.pageserver.tenant_dir(tenant_id).exists()

endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
Expand All @@ -427,7 +427,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
log.info("regular tenant detached without error")

# check that nothing is left on disk for deleted tenant
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
assert not env.pageserver.tenant_dir(tenant_id).exists()

# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
Expand Down Expand Up @@ -528,7 +528,7 @@ def test_ignored_tenant_reattach(
pageserver_http = env.pageserver.http_client()

ignored_tenant_id, _ = env.neon_cli.create_tenant()
tenant_dir = env.pageserver.workdir / "tenants" / str(ignored_tenant_id)
tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_ignore.sort()
timelines_before_ignore = [
Expand Down Expand Up @@ -619,7 +619,7 @@ def test_ignored_tenant_download_missing_layers(

# ignore the tenant and remove its layers
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
layers_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name.startswith("00000"):
Expand Down Expand Up @@ -672,7 +672,7 @@ def test_ignored_tenant_stays_broken_without_metadata(

# ignore the tenant and remove its metadata
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
metadata_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name == "metadata":
Expand Down
6 changes: 3 additions & 3 deletions test_runner/regress/test_tenant_relocation.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def switch_pg_to_new_pageserver(

endpoint.start()

timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id)
timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
files_before_detach = os.listdir(timeline_to_detach_local_path)
assert (
"metadata" in files_before_detach
Expand Down Expand Up @@ -561,7 +561,7 @@ def test_emergency_relocate_with_branches_slow_replay(
# simpler than initializing a new one from scratch, but the effect on the single tenant
# is the same.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.workdir / "tenants" / str(tenant_id))
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
env.pageserver.start()

# This fail point will pause the WAL ingestion on the main branch, after the
Expand Down Expand Up @@ -709,7 +709,7 @@ def test_emergency_relocate_with_branches_createdb(

# Kill the pageserver, remove the tenant directory, and restart
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.workdir / "tenants" / str(tenant_id))
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
env.pageserver.start()

# Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original
Expand Down
Loading

1 comment on commit e6985bd

@github-actions
Copy link

@github-actions github-actions bot commented on e6985bd Sep 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2540 tests run: 2416 passed, 0 failed, 124 skipped (full report)


Flaky tests (3)

Postgres 16

Postgres 14

  • test_heavy_write_workload[neon_off-10-5-5]: release

Code coverage (full report)

  • functions: 53.0% (7662 of 14455 functions)
  • lines: 80.9% (44734 of 55282 lines)

The comment gets automatically updated with the latest test results
e6985bd at 2023-09-15T11:55:07.362Z :recycle:

Please sign in to comment.