Skip to content

Commit

Permalink
Fixing sharding integration tests (#668)
Browse files Browse the repository at this point in the history
* create data dir as well on 'cargo make run', as db creation can fail without it

* add option for file truncation in tracing config

* fix file truncation in tracing

* flush redis in sharding coordinated scenario

* temp tweaks for testing sharding

* quickfix for drifting worker-executor assignments

* add with_env_overrides to tracing config to support env vars in tests

* make fix

* cleanup sharding-tests-debug

* extract EnvBasedTestDependenciesConfig

* add number_of_shards_override to EnvBasedTestDependenciesConfig

* set number of shard from test in sharding integ tests

* remove docker containers by default and add option to keep them for debugging

* fix redis flushing and shard manager restart + send env command cleanup

* rename (add blocking_new prefix)

* flush redis at the start

* fix naming: "shards" -> worker_executor

* more naming / cleanups

* move and extract sharding specific test dependency functions into Deps + cleanups

* check fixes

* add option for changing the number of shards on (spawned) shard-manager restarts

* hide forwarded logs in sharding debug (as most of the logs go into files)

* add Step::invoke_and_await to help defining interleave patterns

* reorder to pull up coordinated tests

* extract scenario fragment and add more variants

* temp increase timeout

* remove exact test filter from sharding-tests-debug

* rename / ignore tests

* fix sharding manager restart to remember number_of_shards_override and re-enable tests

* add / cleanup ShardService debug logs

* use JoinSet in sharding test's invoke_and_await_workers

* cleanup messages

* add combined scenarios, drop some from the simple ones

* fix cli tests deps

* fix cli tests deps

* tweak test cases

* log error before panic

* add more connection error patterns

* refactor is_connection_failure into IsRetriableError and use typed errors

* decrease retries to be faster on CI

* decrease retries even more for now on CI, but add option to override it locally with env var
  • Loading branch information
noise64 authored Jul 22, 2024
1 parent 62f5c4d commit 8c80bf8
Show file tree
Hide file tree
Showing 39 changed files with 1,086 additions and 495 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ jobs:
path: target/${{ matrix.platform.target }}.tar
docker-publish:
runs-on: ubuntu-latest
needs: [docker-targets-build]
needs: [ docker-targets-build ]
if: github.event_name == 'push' && github.ref_type == 'tag'
steps:
- name: Checkout
Expand Down
21 changes: 20 additions & 1 deletion Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# - `cargo make integration-tests`: runs integration tests only
# - `cargo make cli-tests`: runs CLI tests only
# - `cargo make sharding-tests`: runs sharding integration tests only
# - `cargo make sharding-tests-debug`: runs sharding integration tests with file logging enabled, also accepts test name filter arguments
# - `cargo make test`: runs all unit tests, worker executor tests and integration tests
# - `cargo make check-openapi`: generates openapi spec from the code and checks if it is the same as the one in the openapi directory (for CI)
# - `cargo make generate-openapi`: generates openapi spec from the code and saves it to the openapi directory
Expand Down Expand Up @@ -253,6 +254,24 @@ args = [
"--test-threads=1",
]

[tasks.sharding-tests-debug]
dependencies = ["build-bins"]
script = '''
rm -rf logs data
mkdir -pv logs data
export RUST_LOG=info,golem_test_framework::components=WARN
export RUST_BACKTRACE=1
export GOLEM__TRACING__FILE_DIR=../logs
export GOLEM__TRACING__FILE_TRUNCATE=false
export GOLEM__TRACING__FILE__ENABLED=true
cargo test \
--package integration-tests \
--test sharding ${@} \
-- --nocapture --test-threads=1
'''

[tasks.cli-tests]
description = "Runs CLI tests only"
dependencies = ["build-bins"]
Expand Down Expand Up @@ -478,7 +497,7 @@ condition = { fail_message = "Requires lnav, nginx and redis on path. Install th
condition_script = ["nginx -v", "lnav --version", "redis-server --version"]

script = '''
mkdir -pv logs
mkdir -pv data logs
redis-server --port 6380 --save "" --appendonly no &> logs/redis.log &
redis_pid=$!
Expand Down
17 changes: 13 additions & 4 deletions golem-cli/tests/main.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
use golem_test_framework::config::{EnvBasedTestDependencies, TestDependencies};
use libtest_mimic::{Arguments, Conclusion, Failed};
use std::sync::Arc;

use libtest_mimic::{Arguments, Conclusion, Failed};
use tracing::info;

use golem_test_framework::config::{
EnvBasedTestDependencies, EnvBasedTestDependenciesConfig, TestDependencies,
};

mod api_definition;
mod api_deployment;
pub mod cli;
Expand All @@ -29,8 +33,13 @@ fn run(deps: Arc<dyn TestDependencies + Send + Sync + 'static>) -> Conclusion {
fn main() -> Result<(), Failed> {
env_logger::init();

let deps: Arc<dyn TestDependencies + Send + Sync + 'static> =
Arc::new(EnvBasedTestDependencies::blocking_new(3));
let deps: Arc<dyn TestDependencies + Send + Sync + 'static> = Arc::new(
EnvBasedTestDependencies::blocking_new_from_config(EnvBasedTestDependenciesConfig {
worker_executor_cluster_size: 3,
keep_docker_containers: true, // will be dropped by testcontainers (current version does not support double rm)
..EnvBasedTestDependenciesConfig::new()
}),
);
let cluster = deps.worker_executor_cluster(); // forcing startup by getting it
info!("Using cluster with {:?} worker executors", cluster.size());

Expand Down
6 changes: 5 additions & 1 deletion golem-common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ impl<T: ConfigLoaderConfig> ConfigLoader<T> {
Figment::new()
.merge(Serialized::defaults(T::default()))
.merge(Toml::file_exact(self.config_file_name.clone()))
.merge(Env::prefixed(ENV_VAR_PREFIX).split(ENV_VAR_NESTED_SEPARATOR))
.merge(env_config_provider())
}

pub fn load(&self) -> figment::Result<T> {
Expand Down Expand Up @@ -413,3 +413,7 @@ impl RetryConfig {
}
}
}

pub fn env_config_provider() -> Env {
Env::prefixed(ENV_VAR_PREFIX).split(ENV_VAR_NESTED_SEPARATOR)
}
58 changes: 43 additions & 15 deletions golem-common/src/tracing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::fs::File;
use std::fs::OpenOptions;
use std::io::stdout;
use std::path::Path;
use std::sync::Arc;

use crate::config::env_config_provider;
use crate::tracing::format::JsonFlattenSpanFormatter;
use figment::providers::Serialized;
use figment::Figment;
use serde::{Deserialize, Serialize};
use tracing::info;
use tracing_subscriber::fmt::format::FmtSpan;
Expand All @@ -26,8 +30,6 @@ use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::Layer;
use tracing_subscriber::Registry;

use crate::tracing::format::JsonFlattenSpanFormatter;

pub enum Output {
Stdout,
File,
Expand Down Expand Up @@ -135,50 +137,66 @@ pub struct TracingConfig {
pub file: OutputConfig,
pub file_dir: Option<String>,
pub file_name: Option<String>,
pub file_truncate: bool,
pub console: bool,
pub dtor_friendly: bool,
}

impl TracingConfig {
pub fn local_dev(service_name: &str) -> Self {
pub fn local_dev(name: &str) -> Self {
Self {
stdout: OutputConfig::text_ansi(),
file: OutputConfig {
enabled: false,
..OutputConfig::json_flatten_span()
},
file_dir: None,
file_name: Some(format!("{}.log", service_name)),
file_name: Some(format!("{}.log", name)),
file_truncate: true,
console: false,
dtor_friendly: false,
}
}

pub fn test(service_name: &str) -> Self {
pub fn test(name: &str) -> Self {
Self {
dtor_friendly: true,
..Self::local_dev(service_name)
..Self::local_dev(name)
}
}

pub fn test_pretty(service_name: &str) -> Self {
let mut config = Self::test(service_name);
pub fn test_pretty(name: &str) -> Self {
let mut config = Self::test(name);
config.stdout.pretty = true;
config
}

pub fn test_pretty_without_time(service_name: &str) -> Self {
let mut config = Self::test(service_name);
pub fn test_pretty_without_time(name: &str) -> Self {
let mut config = Self::test(name);
config.stdout.pretty = true;
config.stdout.without_time = true;
config
}

pub fn test_compact(service_name: &str) -> Self {
let mut config = Self::test(service_name);
pub fn test_compact(name: &str) -> Self {
let mut config = Self::test(name);
config.stdout.compact = true;
config
}

pub fn with_env_overrides(self) -> Self {
#[derive(Serialize, Deserialize)]
struct Config {
tracing: TracingConfig,
}

Figment::new()
.merge(Serialized::defaults(Config { tracing: self }))
.merge(env_config_provider())
.extract::<Config>()
.expect("Failed to load tracing config env overrides")
.tracing
}
}

impl Default for TracingConfig {
Expand All @@ -191,6 +209,7 @@ impl Default for TracingConfig {
},
file_dir: None,
file_name: None,
file_truncate: true,
console: false,
dtor_friendly: false,
}
Expand Down Expand Up @@ -341,9 +360,18 @@ where
match &config.file_name {
Some(file_name) if config.file.enabled => {
let file_path = Path::new(config.file_dir.as_deref().unwrap_or(".")).join(file_name);
let file = File::create(file_path.clone()).unwrap_or_else(|err| {
panic!("cannot create log file: {:?}, error: {}", file_path, err)

let mut open_options = OpenOptions::new();
if config.file_truncate {
open_options.write(true).create(true).truncate(true);
} else {
open_options.append(true).create(true).truncate(false);
}

let file = open_options.open(&file_path).unwrap_or_else(|err| {
panic!("cannot create log file: {:?}, error: {}", &file_path, err)
});

layers.push(make_layer(
&config.file,
make_filter(Output::File),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ GOLEM__TRACING__CONSOLE=false
GOLEM__TRACING__DTOR_FRIENDLY=false
#GOLEM__TRACING__FILE_DIR=
GOLEM__TRACING__FILE_NAME="component-compilation-service.log"
GOLEM__TRACING__FILE_TRUNCATE=true
GOLEM__TRACING__FILE__ANSI=false
GOLEM__TRACING__FILE__COMPACT=false
GOLEM__TRACING__FILE__ENABLED=false
Expand Down Expand Up @@ -72,6 +73,7 @@ GOLEM__TRACING__CONSOLE=false
GOLEM__TRACING__DTOR_FRIENDLY=false
#GOLEM__TRACING__FILE_DIR=
GOLEM__TRACING__FILE_NAME="component-compilation-service.log"
GOLEM__TRACING__FILE_TRUNCATE=true
GOLEM__TRACING__FILE__ANSI=false
GOLEM__TRACING__FILE__COMPACT=false
GOLEM__TRACING__FILE__ENABLED=false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ port = 9090
console = false
dtor_friendly = false
file_name = "component-compilation-service.log"
file_truncate = true

[tracing.file]
ansi = false
Expand Down Expand Up @@ -106,6 +107,7 @@ without_time = false
# console = false
# dtor_friendly = false
# file_name = "component-compilation-service.log"
# file_truncate = true
#
# [tracing.file]
# ansi = false
Expand Down
2 changes: 2 additions & 0 deletions golem-component-service/config/component-service.sample.env
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ GOLEM__TRACING__CONSOLE=false
GOLEM__TRACING__DTOR_FRIENDLY=false
#GOLEM__TRACING__FILE_DIR=
GOLEM__TRACING__FILE_NAME="component-service.log"
GOLEM__TRACING__FILE_TRUNCATE=true
GOLEM__TRACING__FILE__ANSI=false
GOLEM__TRACING__FILE__COMPACT=false
GOLEM__TRACING__FILE__ENABLED=false
Expand Down Expand Up @@ -56,6 +57,7 @@ GOLEM__TRACING__CONSOLE=false
GOLEM__TRACING__DTOR_FRIENDLY=false
#GOLEM__TRACING__FILE_DIR=
GOLEM__TRACING__FILE_NAME="component-service.log"
GOLEM__TRACING__FILE_TRUNCATE=true
GOLEM__TRACING__FILE__ANSI=false
GOLEM__TRACING__FILE__COMPACT=false
GOLEM__TRACING__FILE__ENABLED=false
Expand Down
2 changes: 2 additions & 0 deletions golem-component-service/config/component-service.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ max_connections = 10
console = false
dtor_friendly = false
file_name = "component-service.log"
file_truncate = true

[tracing.file]
ansi = false
Expand Down Expand Up @@ -84,6 +85,7 @@ without_time = false
# console = false
# dtor_friendly = false
# file_name = "component-service.log"
# file_truncate = true
#
# [tracing.file]
# ansi = false
Expand Down
32 changes: 10 additions & 22 deletions golem-service-base/src/routing_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,20 @@ use async_trait::async_trait;
use serde::Deserialize;
use serde::Serialize;
use tonic::transport::Channel;
use tonic::Status;

use golem_api_grpc::proto::golem::shardmanager;
use golem_api_grpc::proto::golem::shardmanager::shard_manager_service_client::ShardManagerServiceClient;
use golem_api_grpc::proto::golem::shardmanager::ShardManagerError;
use golem_common::cache::*;
use golem_common::client::GrpcClient;
use golem_common::model::RoutingTable;

#[derive(Debug, Clone)]
pub enum RoutingTableError {
Unexpected(String),
GrpcError(Status),
ShardManagerError(ShardManagerError),
NoResult,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
Expand All @@ -53,12 +57,6 @@ impl Default for RoutingTableConfig {
}
}

impl RoutingTableError {
pub fn unexpected(details: impl Into<String>) -> Self {
RoutingTableError::Unexpected(details.into())
}
}

#[async_trait]
pub trait RoutingTableService {
async fn get_routing_table(&self) -> Result<RoutingTable, RoutingTableError>;
Expand Down Expand Up @@ -105,10 +103,7 @@ impl RoutingTableService for RoutingTableServiceDefault {
.get_routing_table(shardmanager::GetRoutingTableRequest {})))
.await
.map_err(|err| {
RoutingTableError::unexpected(format!(
"Getting routing table from shard manager failed with {}",
err
))
RoutingTableError::GrpcError(err)
})?;
match response.into_inner() {
shardmanager::GetRoutingTableResponse {
Expand All @@ -117,15 +112,10 @@ impl RoutingTableService for RoutingTableServiceDefault {
} => Ok(routing_table.into()),
shardmanager::GetRoutingTableResponse {
result: Some(shardmanager::get_routing_table_response::Result::Failure(failure)),
} => Err(RoutingTableError::unexpected(format!(
"Getting routing table from shard manager failed with shard manager error {:?}",
failure
))),
} => Err(RoutingTableError::ShardManagerError(failure)),
shardmanager::GetRoutingTableResponse { result: None } => {
Err(RoutingTableError::unexpected(
"Getting routing table from shard manager failed with unknown error",
))
}
Err(RoutingTableError::NoResult)
},
}
})
})
Expand All @@ -142,9 +132,7 @@ pub struct RoutingTableServiceNoop {}
#[async_trait]
impl RoutingTableService for RoutingTableServiceNoop {
async fn get_routing_table(&self) -> Result<RoutingTable, RoutingTableError> {
Err(RoutingTableError::unexpected(
"Routing table service is not configured",
))
Err(RoutingTableError::NoResult)
}

async fn invalidate_routing_table(&self) {}
Expand Down
2 changes: 2 additions & 0 deletions golem-shard-manager/config/shard-manager.sample.env
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ GOLEM__TRACING__CONSOLE=false
GOLEM__TRACING__DTOR_FRIENDLY=false
#GOLEM__TRACING__FILE_DIR=
GOLEM__TRACING__FILE_NAME="shard-manager.log"
GOLEM__TRACING__FILE_TRUNCATE=true
GOLEM__TRACING__FILE__ANSI=false
GOLEM__TRACING__FILE__COMPACT=false
GOLEM__TRACING__FILE__ENABLED=false
Expand Down Expand Up @@ -73,6 +74,7 @@ GOLEM__TRACING__CONSOLE=false
GOLEM__TRACING__DTOR_FRIENDLY=false
#GOLEM__TRACING__FILE_DIR=
GOLEM__TRACING__FILE_NAME="shard-manager.log"
GOLEM__TRACING__FILE_TRUNCATE=true
GOLEM__TRACING__FILE__ANSI=false
GOLEM__TRACING__FILE__COMPACT=false
GOLEM__TRACING__FILE__ENABLED=false
Expand Down
Loading

0 comments on commit 8c80bf8

Please sign in to comment.