Skip to content

Commit

Permalink
Print a log message if GetPage response takes too long (#10046)
Browse files Browse the repository at this point in the history
We have metrics for GetPage request latencies, but this is an extra
measure to capture requests that take way too long in the logs. The log
message is printed every 10 s, until the response is received:

```
PG:2024-12-09 16:02:07.715 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] no response received from pageserver for 10.000 s, still waiting (sent 10613 requests, received 10612 responses)
PG:2024-12-09 16:02:17.723 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] no response received from pageserver for 20.008 s, still waiting (sent 10613 requests, received 10612 responses)
PG:2024-12-09 16:02:19.719 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] received response from pageserver after 22.006 s
```
  • Loading branch information
hlinnaka authored Dec 10, 2024
1 parent 6ad9982 commit b853f78
Showing 1 changed file with 66 additions and 1 deletion.
67 changes: 66 additions & 1 deletion pgxn/neon/libpagestore.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/interrupt.h"
#include "storage/buf_internals.h"
#include "storage/ipc.h"
Expand Down Expand Up @@ -118,6 +119,11 @@ typedef struct
*/
PSConnectionState state;
PGconn *conn;

/* request / response counters for debugging */
uint64 nrequests_sent;
uint64 nresponses_received;

/*---
* WaitEventSet containing:
* - WL_SOCKET_READABLE on 'conn'
Expand Down Expand Up @@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
}

shard->state = PS_Connected;
shard->nrequests_sent = 0;
shard->nresponses_received = 0;
}
/* FALLTHROUGH */
case PS_Connected:
Expand Down Expand Up @@ -656,16 +664,40 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
int ret;
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn = shard->conn;
instr_time now,
start_ts,
since_start,
last_log_ts,
since_last_log;
bool logged = false;

/*
* As a debugging aid, if we don't get a response for a long time, print a
* log message.
*
* 10 s is a very generous threshold, normally we expect a response in a
* few milliseconds. We have metrics to track latencies in normal ranges,
* but in the cases that take exceptionally long, it's useful to log the
* exact timestamps.
*/
#define LOG_INTERVAL_US UINT64CONST(10 * 1000000)

INSTR_TIME_SET_CURRENT(now);
start_ts = last_log_ts = now;
INSTR_TIME_SET_ZERO(since_last_log);

retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );

if (ret == 0)
{
WaitEvent event;
long timeout;

timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log));

/* Sleep until there's something to do */
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
WAIT_EVENT_NEON_PS_READ);
ResetLatch(MyLatch);

Expand All @@ -684,9 +716,40 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
}
}

/*
* Print a message to the log if a long time has passed with no
* response.
*/
INSTR_TIME_SET_CURRENT(now);
since_last_log = now;
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
{
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
INSTR_TIME_GET_DOUBLE(since_start),
shard->nrequests_sent, shard->nresponses_received);
last_log_ts = now;
logged = true;
}

goto retry;
}

/*
* If we logged earlier that the response is taking a long time, log
* another message when the response is finally received.
*/
if (logged)
{
INSTR_TIME_SET_CURRENT(now);
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
INSTR_TIME_GET_DOUBLE(since_start));
}

return ret;
}

Expand Down Expand Up @@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
* point, but on the grand scheme of things it's only a small issue.
*/
shard->nrequests_sent++;
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
Expand Down Expand Up @@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
}

shard->nresponses_received++;
return (NeonResponse *) resp;
}

Expand Down

1 comment on commit b853f78

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7062 tests run: 6746 passed, 1 failed, 315 skipped (full report)


Failures on Postgres 17

  • test_pageserver_small_inmemory_layers[False]: debug-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_pageserver_small_inmemory_layers[debug-pg17-False]"
Flaky tests (2)

Postgres 17

Test coverage report is not available

The comment gets automatically updated with the latest test results
b853f78 at 2024-12-10T17:29:40.605Z :recycle:

Please sign in to comment.