Skip to content

Commit

Permalink
async-profiler#1007: Optimize wall clock profiling
Browse files Browse the repository at this point in the history
  • Loading branch information
apangin authored Sep 26, 2024
1 parent a386afa commit f53bfd4
Show file tree
Hide file tree
Showing 22 changed files with 324 additions and 131 deletions.
4 changes: 4 additions & 0 deletions src/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ static inline u64 atomicInc(volatile u64& var, u64 increment = 1) {
return __sync_fetch_and_add(&var, increment);
}

static inline int atomicInc(volatile u32& var, int increment = 1) {
return __sync_fetch_and_add(&var, increment);
}

// Atomically adds `increment` to the int counter `var`.
// Returns the value `var` held immediately before the addition.
static inline int atomicInc(volatile int& var, int increment = 1) {
    const int previous = __sync_fetch_and_add(&var, increment);
    return previous;
}
Expand Down
4 changes: 4 additions & 0 deletions src/arguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ static const Multiplier UNIVERSAL[] = {{'n', 1}, {'u', 1000}, {'m', 1000000}, {'
// live - build allocation profile from live objects only
// lock[=DURATION] - profile contended locks overflowing the DURATION ns bucket (default: 10us)
// wall[=NS] - run wall clock profiling together with CPU profiling
// nobatch - legacy wall clock sampling without batch events
// collapsed - dump collapsed stacks (the format used by FlameGraph script)
// flamegraph - produce Flame Graph in HTML format
// tree - produce call tree in HTML format
Expand Down Expand Up @@ -335,6 +336,9 @@ Error Arguments::parse(const char* args) {
CASE("live")
_live = true;

CASE("nobatch")
_nobatch = true;

CASE("allkernel")
_ring = RING_KERNEL;

Expand Down
2 changes: 2 additions & 0 deletions src/arguments.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ class Arguments {
bool _threads;
bool _sched;
bool _live;
bool _nobatch;
bool _fdtransfer;
const char* _fdtransfer_path;
int _style;
Expand Down Expand Up @@ -231,6 +232,7 @@ class Arguments {
_threads(false),
_sched(false),
_live(false),
_nobatch(false),
_fdtransfer(false),
_fdtransfer_path(NULL),
_style(0),
Expand Down
4 changes: 2 additions & 2 deletions src/callTraceStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ u32 CallTraceStorage::put(int num_frames, ASGCT_CallFrame* frames, u64 counter)
return capacity - (INITIAL_CAPACITY - 1) + slot;
}

void CallTraceStorage::add(u32 call_trace_id, u64 counter) {
void CallTraceStorage::add(u32 call_trace_id, u64 samples, u64 counter) {
if (call_trace_id == OVERFLOW_TRACE_ID) {
return;
}
Expand All @@ -283,7 +283,7 @@ void CallTraceStorage::add(u32 call_trace_id, u64 counter) {
for (LongHashTable* table = _current_table; table != NULL; table = table->prev()) {
if (call_trace_id >= table->capacity()) {
CallTraceSample& s = table->values()[call_trace_id - table->capacity()];
atomicInc(s.samples);
atomicInc(s.samples, samples);
atomicInc(s.counter, counter);
break;
}
Expand Down
2 changes: 1 addition & 1 deletion src/callTraceStorage.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class CallTraceStorage {
void collectSamples(std::map<u64, CallTraceSample>& map);

u32 put(int num_frames, ASGCT_CallFrame* frames, u64 counter);
void add(u32 call_trace_id, u64 counter);
void add(u32 call_trace_id, u64 samples, u64 counter);
};

#endif // _CALLTRACESTORAGE
11 changes: 8 additions & 3 deletions src/converter/one/jfr/JfrReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public class JfrReader implements Closeable {

private int executionSample;
private int nativeMethodSample;
private int wallClockSample;
private int allocationInNewTLAB;
private int allocationOutsideTLAB;
private int allocationSample;
Expand Down Expand Up @@ -166,7 +167,9 @@ public <E extends Event> E readEvent(Class<E> cls) throws IOException {
}

if (type == executionSample || type == nativeMethodSample) {
if (cls == null || cls == ExecutionSample.class) return (E) readExecutionSample();
if (cls == null || cls == ExecutionSample.class) return (E) readExecutionSample(false);
} else if (type == wallClockSample) {
if (cls == null || cls == ExecutionSample.class) return (E) readExecutionSample(true);
} else if (type == allocationInNewTLAB) {
if (cls == null || cls == AllocationSample.class) return (E) readAllocationSample(true);
} else if (type == allocationOutsideTLAB || type == allocationSample) {
Expand Down Expand Up @@ -199,12 +202,13 @@ public <E extends Event> E readEvent(Class<E> cls) throws IOException {
return null;
}

private ExecutionSample readExecutionSample() {
private ExecutionSample readExecutionSample(boolean hasSamples) {
long time = getVarlong();
int tid = getVarint();
int stackTraceId = getVarint();
int threadState = getVarint();
return new ExecutionSample(time, tid, stackTraceId, threadState);
int samples = hasSamples ? getVarint() : 1;
return new ExecutionSample(time, tid, stackTraceId, threadState, samples);
}

private AllocationSample readAllocationSample(boolean tlab) {
Expand Down Expand Up @@ -528,6 +532,7 @@ private void readFields(int count) {
private void cacheEventTypes() {
executionSample = getTypeId("jdk.ExecutionSample");
nativeMethodSample = getTypeId("jdk.NativeMethodSample");
wallClockSample = getTypeId("profiler.WallClockSample");
allocationInNewTLAB = getTypeId("jdk.ObjectAllocationInNewTLAB");
allocationOutsideTLAB = getTypeId("jdk.ObjectAllocationOutsideTLAB");
allocationSample = getTypeId("jdk.ObjectAllocationSample");
Expand Down
4 changes: 4 additions & 0 deletions src/converter/one/jfr/event/Event.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ public long classId() {
return 0;
}

/**
 * Number of samples this event stands for.
 * The base implementation always reports a single sample; subclasses
 * (e.g. ExecutionSample) override it.
 *
 * @return 1
 */
public long samples() {
    return 1L;
}

/**
 * Value contributed by this event when EventAggregator collects in "total" mode.
 * The base implementation always reports 1; subclasses may override it.
 *
 * @return 1
 */
public long value() {
    return 1L;
}
Expand Down
4 changes: 2 additions & 2 deletions src/converter/one/jfr/event/EventAggregator.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ public void collect(Event e) {
int i = hashCode(e) & mask;
while (keys[i] != null) {
if (sameGroup(keys[i], e)) {
values[i] += total ? e.value() : 1;
values[i] += total ? e.value() : e.samples();
return;
}
i = (i + 1) & mask;
}

keys[i] = e;
values[i] = total ? e.value() : 1;
values[i] = total ? e.value() : e.samples();

if (++size * 2 > keys.length) {
resize(keys.length * 2);
Expand Down
14 changes: 13 additions & 1 deletion src/converter/one/jfr/event/ExecutionSample.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,21 @@

public class ExecutionSample extends Event {
public final int threadState;
public final int samples;

public ExecutionSample(long time, int tid, int stackTraceId, int threadState) {
public ExecutionSample(long time, int tid, int stackTraceId, int threadState, int samples) {
super(time, tid, stackTraceId);
this.threadState = threadState;
this.samples = samples;
}

/**
 * @return the sample count stored in this event's {@code samples} field
 */
@Override
public long samples() {
    return this.samples;
}

/**
 * For execution samples the value is the same as the sample count.
 *
 * @return the sample count stored in this event's {@code samples} field
 */
@Override
public long value() {
    return this.samples;
}
}
3 changes: 2 additions & 1 deletion src/cpuEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ int CpuEngine::createForAllThreads() {
int result = EPERM;

ThreadList* thread_list = OS::listThreads();
for (int tid; (tid = thread_list->next()) != -1; ) {
while (thread_list->hasNext()) {
int tid = thread_list->next();
int err = createForThread(tid);
if (isResourceLimit(err)) {
result = err;
Expand Down
8 changes: 8 additions & 0 deletions src/event.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
enum EventType {
PERF_SAMPLE,
EXECUTION_SAMPLE,
WALL_CLOCK_SAMPLE,
INSTRUMENTED_METHOD,
ALLOC_SAMPLE,
ALLOC_OUTSIDE_TLAB,
Expand All @@ -39,6 +40,13 @@ class ExecutionEvent : public Event {
ExecutionEvent(u64 start_time) : _start_time(start_time), _thread_state(THREAD_UNKNOWN) {}
};

// Event payload for a wall clock sample; serialized by
// Recording::recordWallClockSample as a T_WALL_CLOCK_SAMPLE record.
// No constructor is declared here, so the fields are presumably filled in by
// the code that creates the event - TODO confirm against the sampling engine.
class WallClockEvent : public Event {
public:
// Written as the record's start time (declared as F_TIME_TICKS in the JFR metadata).
u64 _start_time;
// State of the sampled thread; written as the record's "state" field.
ThreadState _thread_state;
// Written as the record's "samples" field; presumably the number of samples
// this single event stands for - TODO confirm.
u32 _samples;
};

class AllocEvent : public EventWithClassId {
public:
u64 _start_time;
Expand Down
16 changes: 16 additions & 0 deletions src/flightRecorder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -827,12 +827,14 @@ class Recording {
}
if (args._wall >= 0) {
writeIntSetting(buf, T_EXECUTION_SAMPLE, "wall", args._wall);
writeBoolSetting(buf, T_EXECUTION_SAMPLE, "nobatch", args._nobatch);
}

writeBoolSetting(buf, T_ALLOC_IN_NEW_TLAB, "enabled", args._alloc >= 0);
writeBoolSetting(buf, T_ALLOC_OUTSIDE_TLAB, "enabled", args._alloc >= 0);
if (args._alloc >= 0) {
writeIntSetting(buf, T_ALLOC_IN_NEW_TLAB, "alloc", args._alloc);
writeBoolSetting(buf, T_ALLOC_IN_NEW_TLAB, "live", args._live);
}

writeBoolSetting(buf, T_MONITOR_ENTER, "enabled", args._lock >= 0);
Expand Down Expand Up @@ -1187,6 +1189,17 @@ class Recording {
buf->put8(start, buf->offset() - start);
}

// Serializes one wall clock sample into `buf` as a T_WALL_CLOCK_SAMPLE record.
// Fields are emitted in the order declared for "profiler.WallClockSample" in
// the JFR metadata: start time, thread, stack trace, thread state, samples.
//   buf           - output buffer the record is appended to
//   tid           - id of the sampled thread
//   call_trace_id - id of the stack trace associated with the sample
//   event         - supplies _start_time, _thread_state and _samples
void recordWallClockSample(Buffer* buf, int tid, u32 call_trace_id, WallClockEvent* event) {
// Skip one byte and remember its offset; it is filled in by the last statement.
int start = buf->skip(1);
buf->put8(T_WALL_CLOCK_SAMPLE);
buf->putVar64(event->_start_time);
buf->putVar32(tid);
buf->putVar32(call_trace_id);
buf->putVar32(event->_thread_state);
buf->putVar32(event->_samples);
// Store the distance from `start` to the current offset (presumably the
// record size in bytes) into the byte skipped above.
buf->put8(start, buf->offset() - start);
}

void recordAllocationInNewTLAB(Buffer* buf, int tid, u32 call_trace_id, AllocEvent* event) {
int start = buf->skip(1);
buf->put8(T_ALLOC_IN_NEW_TLAB);
Expand Down Expand Up @@ -1459,6 +1472,9 @@ void FlightRecorder::recordEvent(int lock_index, int tid, u32 call_trace_id,
case INSTRUMENTED_METHOD:
_rec->recordExecutionSample(buf, tid, call_trace_id, (ExecutionEvent*)event);
break;
case WALL_CLOCK_SAMPLE:
_rec->recordWallClockSample(buf, tid, call_trace_id, (WallClockEvent*)event);
break;
case ALLOC_SAMPLE:
_rec->recordAllocationInNewTLAB(buf, tid, call_trace_id, (AllocEvent*)event);
break;
Expand Down
8 changes: 8 additions & 0 deletions src/jfrMetadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,14 @@ JfrMetadata::JfrMetadata() : Element("root") {
<< field("allocationSize", T_LONG, "Allocation Size", F_BYTES)
<< field("allocationTime", T_LONG, "Allocation Time", F_TIME_TICKS))

<< (type("profiler.WallClockSample", T_WALL_CLOCK_SAMPLE, "Wall Clock Sample")
<< category("Java Virtual Machine", "Profiling")
<< field("startTime", T_LONG, "Start Time", F_TIME_TICKS)
<< field("sampledThread", T_THREAD, "Thread", F_CPOOL)
<< field("stackTrace", T_STACK_TRACE, "Stack Trace", F_CPOOL)
<< field("state", T_THREAD_STATE, "Thread State", F_CPOOL)
<< field("samples", T_INT, "Samples", F_UNSIGNED))

<< (type("jdk.jfr.Label", T_LABEL, NULL)
<< field("value", T_STRING))

Expand Down
1 change: 1 addition & 0 deletions src/jfrMetadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ enum JfrType {
T_LOG = 115,
T_WINDOW = 116,
T_LIVE_OBJECT = 117,
T_WALL_CLOCK_SAMPLE = 118,

T_ANNOTATION = 200,
T_LABEL = 201,
Expand Down
2 changes: 1 addition & 1 deletion src/objectSampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ class LiveRefs {

int tid = _values[i].trace >> 32;
u32 call_trace_id = (u32)_values[i].trace;
profiler->recordExternalSample(event._alloc_size, tid, LIVE_OBJECT, &event, call_trace_id);
profiler->recordExternalSamples(1, event._alloc_size, tid, call_trace_id, LIVE_OBJECT, &event);
}
jni->DeleteWeakGlobalRef(w);
}
Expand Down
19 changes: 17 additions & 2 deletions src/os.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,25 @@ enum ThreadState {


// Abstract cursor over a collection of thread ids; instances are obtained from
// OS::listThreads(). The base class only holds the position/size bookkeeping;
// how ids are produced is left to subclasses.
class ThreadList {
protected:
// Cursor position compared against _count by hasNext(); presumably advanced
// by next() in subclasses - TODO confirm.
u32 _index;
// Upper bound for _index; presumably the number of threads in the list - TODO confirm.
u32 _count;

// Both the cursor and the count start at zero, so hasNext() is false until
// _count is changed.
ThreadList() : _index(0), _count(0) {
}

public:
virtual ~ThreadList() {}
virtual void rewind() = 0;

// Read-only accessors for the cursor position and the element count.
u32 index() const { return _index; }
u32 count() const { return _count; }

// True while the cursor has not reached _count.
bool hasNext() const {
return _index < _count;
}

// Implemented by subclasses. Callers such as CpuEngine::createForAllThreads
// treat the value returned by next() as a thread id.
virtual int next() = 0;
virtual int size() = 0;
virtual void update() = 0;
};


Expand Down Expand Up @@ -66,6 +80,7 @@ class OS {
static const char* schedPolicy(int thread_id);
static bool threadName(int thread_id, char* name_buf, size_t name_len);
static ThreadState threadState(int thread_id);
static u64 threadCpuTime(int thread_id);
static ThreadList* listThreads();

static bool isLinux();
Expand Down
Loading

0 comments on commit f53bfd4

Please sign in to comment.