From 8b6057c03d587beeb774641774d79fd84bbaa28e Mon Sep 17 00:00:00 2001 From: kalibera Date: Mon, 7 Aug 2023 16:44:35 +0000 Subject: [PATCH] Support wall-clock time profiling on Unix (PR#18076). git-svn-id: https://svn.r-project.org/R/trunk@84898 00db46b3-68df-0310-9c12-caf00c1e9a41 --- src/library/utils/R/Rprof.R | 8 +- src/library/utils/man/Rprof.Rd | 84 ++++++----- src/library/utils/src/init.c | 4 +- src/main/eval.c | 249 +++++++++++++++++++++++++-------- 4 files changed, 247 insertions(+), 98 deletions(-) diff --git a/src/library/utils/R/Rprof.R b/src/library/utils/R/Rprof.R index 0120299ec91..40b670782e8 100644 --- a/src/library/utils/R/Rprof.R +++ b/src/library/utils/R/Rprof.R @@ -1,7 +1,7 @@ # File src/library/utils/R/Rprof.R # Part of the R package, https://www.R-project.org # -# Copyright (C) 1995-2013 The R Core Team +# Copyright (C) 1995-2023 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -19,12 +19,14 @@ Rprof <- function(filename = "Rprof.out", append = FALSE, interval = 0.02, memory.profiling = FALSE, gc.profiling = FALSE, line.profiling = FALSE, filter.callframes = FALSE, - numfiles = 100L, bufsize = 10000L) + numfiles = 100L, bufsize = 10000L, + event = c("default", "cpu", "wall")) { + event <- match.arg(event) if(is.null(filename)) filename <- "" invisible(.External(C_Rprof, filename, append, interval, memory.profiling, gc.profiling, line.profiling, filter.callframes, - numfiles, bufsize)) + numfiles, bufsize, event)) } Rprofmem <- function(filename = "Rprofmem.out", append = FALSE, threshold = 0) diff --git a/src/library/utils/man/Rprof.Rd b/src/library/utils/man/Rprof.Rd index 52400544917..6e3396f56f1 100644 --- a/src/library/utils/man/Rprof.Rd +++ b/src/library/utils/man/Rprof.Rd @@ -1,6 +1,6 @@ % File src/library/utils/man/Rprof.Rd % Part of the R package, https://www.R-project.org -% Copyright 1995-2022 R Core Team +% Copyright 1995-2023 R Core Team % Distributed under GPL 2 or later \name{Rprof} @@ -13,7 +13,8 @@ Rprof(filename = "Rprof.out", append = FALSE, interval = 0.02, memory.profiling = FALSE, gc.profiling = FALSE, line.profiling = FALSE, filter.callframes = FALSE, - numfiles = 100L, bufsize = 10000L) + numfiles = 100L, bufsize = 10000L, + event = c("default", "cpu", "wall")) } \arguments{ \item{filename}{ @@ -24,7 +25,7 @@ Rprof(filename = "Rprof.out", append = FALSE, interval = 0.02, logical: should the file be over-written or appended to? } \item{interval}{ - real: time interval between samples. + real: distance (time interval) between samples. } \item{memory.profiling}{logical: write memory use information to the file?} \item{gc.profiling}{logical: record whether GC is running?} @@ -32,41 +33,60 @@ Rprof(filename = "Rprof.out", append = FALSE, interval = 0.02, \item{filter.callframes}{logical: filter out intervening call frames of the call tree. See the filtering out call frames section.} \item{numfiles, bufsize}{integers: line profiling memory allocation} + \item{event}{ + character: profiling event, character vector of length one, + \code{"wall"} for wall-clock time and \code{"cpu"} for CPU + time, both measured in seconds. \code{"default"} is the default + event on the platform, one of the two. See the \sQuote{Details}. + } } \details{ Enabling profiling automatically disables any existing profiling to another or the same file. Profiling works by writing out the call stack every \code{interval} - seconds, to the file specified. Either the \code{\link{summaryRprof}} - function or the wrapper script \command{R CMD Rprof} can be used to - process the output file to produce a summary of the usage; use - \command{R CMD Rprof --help} for usage information. - - How time is measured varies by platform: - \describe{ - \item{On Windows:}{ - Exactly what the time interval measures is subtle: it is time that the - \R process is running and executing an \R command. It is not however just - CPU time, for if \code{readline()} is waiting for input, that counts - (on Windows, but not on a Unix-alike). - - Note that the timing interval cannot be too small, for the time spent - in each profiling step is added to the interval. What is feasible is - machine-dependent, but 10ms seemed as small as advisable on a 1GHz machine. - } - \item{On Unix-alikes}{it is the CPU - time of the \R process, so for example excludes time when \R is waiting - for input or for processes run by \code{\link{system}} to return. - - Note that the timing interval cannot usefully be too small: once the - timer goes off, the information is not recorded until the next timing - click (probably in the range 1--10 ms). - } - } - - On Linux, the interval needs to be at least 10ms, on all other platforms - at least 1ms. Shorter intervals will be rounded up with a warning. + seconds (units of the profiling event), to the file specified. Either the + \code{\link{summaryRprof}} function or the wrapper script \command{R CMD + Rprof} can be used to process the output file to produce a summary of the + usage; use \command{R CMD Rprof --help} for usage information. + + Exactly what is measured is subtle and depends on the profiling event. + + With \code{"wall"} (the default and only supported event on Windows): it + is time that the \R process is running and executing an \R command. It is + not however just CPU time, for if \code{readline()} is waiting for input, + that counts as well. + + With \code{"cpu"} (the default on Unix and typically the preferred event + for identifying performance bottlenecks), it is CPU time of the \R process, + so for example excludes time when \R is waiting for input or for processes + run by \code{\link{system}} to return. It may go slower than + \code{"wall"} when the process is often waiting for I/O to finish, but it + may go faster with actively computing concurrent threads (say via OpenMP) + on a multi-core system. + + Note that the (timing) interval cannot be too small. With \code{"cpu"}, + the time spent in each profiling step is currently added to the interval. + With all profiling events, the computation in each profiling step causes + perturbation to the observed system and biases the results. What is + feasible is machine-dependent. On Linux, R requires the interval to be at + least 10ms, on all other platforms at least 1ms. Shorter intervals will + be rounded up with a warning. + + The \code{"default"} profiling event is \code{"wall"} on Windows and + \code{"cpu"} on Unix. + + Support for \code{"wall"} event on Unix is new and considered + experimental. To reduce the risk of missing a sample, R tries to use the + (real-time) FIFO scheduling policy with the maximum scheduling priority + for an internal thread which initiates collection of each sample. If + setting that priority fails, it tries to use the maximum scheduling + priority of the current scheduling policy, falling back to the current + scheduling parameters. On Linux, regular users are typically not allowed + to use the real-time scheduling priorities. This can be usually allowed + via PAM (e.g. \file{/etc/security/limits.conf}), see the OS documentation + for details. The priorities only matter when profiling a system under + high load. Functions will only be recorded in the profile log if they put a context on the call stack (see \code{\link{sys.calls}}). Some diff --git a/src/library/utils/src/init.c b/src/library/utils/src/init.c index 338f2c8718e..73d0dc191cb 100644 --- a/src/library/utils/src/init.c +++ b/src/library/utils/src/init.c @@ -1,6 +1,6 @@ /* * R : A Computer Language for Statistical Data Analysis - * Copyright (C) 2012-2022 The R Core Team. + * Copyright (C) 2012-2023 The R Core Team. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -80,7 +80,7 @@ static const R_ExternalMethodDef ExtEntries[] = { EXTDEF(download, 6), #endif EXTDEF(unzip, 7), - EXTDEF(Rprof, 9), + EXTDEF(Rprof, 10), EXTDEF(Rprofmem, 3), EXTDEF(countfields, 6), diff --git a/src/main/eval.c b/src/main/eval.c index 0f63a62a43d..64663442818 100644 --- a/src/main/eval.c +++ b/src/main/eval.c @@ -106,6 +106,16 @@ static int R_Profiling = 0; # endif #endif /* not Win32 */ +#if !defined(Win32) && defined(HAVE_PTHREAD) +// is needed for pthread_kill on most platforms (and by POSIX +// but apparently not FreeBSD): it is included above. +# include +# ifdef HAVE_SCHED_H +# include +# endif +static pthread_t R_profiled_thread; +#endif + #ifdef Win32 static FILE *R_ProfileOutfile = NULL; #else @@ -118,13 +128,27 @@ static int R_Line_Profiling = 0; /* indicates line profiling, static char **R_Srcfiles; /* an array of pointers into the filename buffer */ static size_t R_Srcfile_bufcount; /* how big is the array above? */ static SEXP R_Srcfiles_buffer = NULL; /* a big RAWSXP to use as a buffer for filenames and pointers to them */ -static int R_Profiling_Error; /* record errors here */ +static int R_Profiling_Error; /* record errors here */ static int R_Filter_Callframes = 0; /* whether to record only the trailing branch of call trees */ +typedef enum { RPE_CPU, RPE_WALL } rpe_type; /* profiling event, CPU time or wall-clock time */ +static rpe_type R_Profiling_Event; + #ifdef Win32 HANDLE MainThread; HANDLE ProfileEvent; -#endif /* Win32 */ +#else +# ifdef HAVE_PTHREAD +typedef struct { + pthread_t thread; + pthread_mutex_t terminate_mu; + pthread_cond_t terminate_cv; + int should_terminate; + int interval_us; +} R_profile_thread_info_t; +static R_profile_thread_info_t R_Profile_Thread_Info; +# endif +#endif /* Careful here! These functions are called asynchronously, maybe in the middle of GC, so don't do any allocations. They get called in a signal @@ -331,12 +355,6 @@ static void lineprof(profbuf* pb, SEXP srcref) } } -#if !defined(Win32) && defined(HAVE_PTHREAD) -// is needed for pthread_kill on most platforms (and by POSIX -// but apparently not FreeBSD): it is included above. -# include -static pthread_t R_profiled_thread; -#endif #if defined(__APPLE__) #include @@ -429,22 +447,26 @@ static void doprof(int sig) /* sig is ignored in Windows */ #ifdef Win32 SuspendThread(MainThread); #elif defined(__APPLE__) - /* Using Mach thread API to detect whether we are on the main thread, - because pthread_self() sometimes crashes R due to a page fault when - the signal handler runs just after the new thread is created, but - before pthread initialization has been finished. */ - mach_port_t id = mach_thread_self(); - mach_port_deallocate(mach_task_self(), id); - if (id != R_profiled_thread_id) { - pthread_kill(R_profiled_thread, sig); - errno = old_errno; - return; + if (R_Profiling_Event == RPE_CPU) { + /* Using Mach thread API to detect whether we are on the main thread, + because pthread_self() sometimes crashes R due to a page fault when + the signal handler runs just after the new thread is created, but + before pthread initialization has been finished. */ + mach_port_t id = mach_thread_self(); + mach_port_deallocate(mach_task_self(), id); + if (id != R_profiled_thread_id) { + pthread_kill(R_profiled_thread, sig); + errno = old_errno; + return; + } } #elif defined(HAVE_PTHREAD) - if (! pthread_equal(pthread_self(), R_profiled_thread)) { - pthread_kill(R_profiled_thread, sig); - errno = old_errno; - return; + if (R_Profiling_Event == RPE_CPU) { + if (! pthread_equal(pthread_self(), R_profiled_thread)) { + pthread_kill(R_profiled_thread, sig); + errno = old_errno; + return; + } } #endif /* Win32 */ @@ -572,7 +594,7 @@ static void doprof(int sig) /* sig is ignored in Windows */ /* Profiling thread main function */ static void __cdecl ProfileThread(void *pwait) { - int wait = *((int *)pwait); + int wait = *((int *)pwait); /* milliseconds */ SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST); while(WaitForSingleObject(ProfileEvent, wait) != WAIT_OBJECT_0) { @@ -580,6 +602,35 @@ static void __cdecl ProfileThread(void *pwait) } } #else /* not Win32 */ +/* Profiling thread main function */ +static void *ProfileThread(void *pinfo) +{ +#ifdef HAVE_PTHREAD + R_profile_thread_info_t *nfo = pinfo; + + pthread_mutex_lock(&nfo->terminate_mu); + while(!nfo->should_terminate) { + struct timespec until; + double duntil_s = currentTime() + nfo->interval_us / 1e6; + + until.tv_sec = (time_t) duntil_s; + until.tv_nsec = (long) (1e9 * (duntil_s - until.tv_sec)); + + for(;;) { + int res = pthread_cond_timedwait(&nfo->terminate_cv, + &nfo->terminate_mu, &until); + if (nfo->should_terminate) + break; + if (res == ETIMEDOUT) { + pthread_kill(R_profiled_thread, SIGPROF); + break; + } + } + } + pthread_mutex_unlock(&nfo->terminate_mu); +#endif + return NULL; +} static void doprof_null(int sig) { signal(SIGPROF, doprof_null); @@ -595,13 +646,25 @@ static void R_EndProfiling(void) if(R_ProfileOutfile) fclose(R_ProfileOutfile); R_ProfileOutfile = NULL; #else /* not Win32 */ - struct itimerval itv; - - itv.it_interval.tv_sec = 0; - itv.it_interval.tv_usec = 0; - itv.it_value.tv_sec = 0; - itv.it_value.tv_usec = 0; - setitimer(ITIMER_PROF, &itv, NULL); + if (R_Profiling_Event == RPE_CPU) { + struct itimerval itv; + + itv.it_interval.tv_sec = 0; + itv.it_interval.tv_usec = 0; + itv.it_value.tv_sec = 0; + itv.it_value.tv_usec = 0; + setitimer(ITIMER_PROF, &itv, NULL); + } + if (R_Profiling_Event == RPE_WALL) { + R_profile_thread_info_t *nfo = &R_Profile_Thread_Info; + pthread_mutex_lock(&nfo->terminate_mu); + nfo->should_terminate = 1; + pthread_cond_signal(&nfo->terminate_cv); + pthread_mutex_unlock(&nfo->terminate_mu); + pthread_join(nfo->thread, NULL); + pthread_cond_destroy(&nfo->terminate_cv); + pthread_mutex_destroy(&nfo->terminate_mu); + } signal(SIGPROF, doprof_null); if(R_ProfileOutfile >= 0) close(R_ProfileOutfile); R_ProfileOutfile = -1; @@ -625,10 +688,9 @@ static void R_EndProfiling(void) static void R_InitProfiling(SEXP filename, int append, double dinterval, int mem_profiling, int gc_profiling, int line_profiling, int filter_callframes, - int numfiles, int bufsize) + int numfiles, int bufsize, rpe_type event) { #ifndef Win32 - struct itimerval itv; const void *vmax = vmaxget(); if(R_ProfileOutfile >= 0) R_EndProfiling(); @@ -689,6 +751,8 @@ static void R_InitProfiling(SEXP filename, int append, double dinterval, *(R_Srcfiles[0]) = '\0'; } + R_Profiling_Event = event; + #ifdef Win32 /* need to duplicate to make a real handle */ DuplicateHandle(Proc, GetCurrentThread(), Proc, &MainThread, @@ -699,39 +763,81 @@ static void R_InitProfiling(SEXP filename, int append, double dinterval, R_Suicide("unable to create profiling thread"); Sleep(wait/2); /* suspend this thread to ensure that the other one starts */ #else /* not Win32 */ -#ifdef HAVE_PTHREAD + +# ifdef HAVE_PTHREAD R_profiled_thread = pthread_self(); -#else +# else error("profiling requires 'pthread' support"); -#endif +# endif -#if defined(__APPLE__) - /* see comment in doprof for why R_profiled_thread is not enough */ - R_profiled_thread_id = mach_thread_self(); - mach_port_deallocate(mach_task_self(), R_profiled_thread_id); -#endif +# if defined(__APPLE__) + if (R_Profiling_Event == RPE_CPU) { + /* see comment in doprof for why R_profiled_thread is not enough */ + R_profiled_thread_id = mach_thread_self(); + mach_port_deallocate(mach_task_self(), R_profiled_thread_id); + } +# endif signal(SIGPROF, doprof); - /* The macOS implementation requires normalization here: - - setitimer is obsolescent (POSIX >= 2008), replaced by - timer_create / timer_settime, but the supported clocks are - implementation-dependent. - - Recent Linux has CLOCK_PROCESS_CPUTIME_ID - Solaris has CLOCK_PROF, in -lrt. - FreeBSD only supports CLOCK_{REALTIME,MONOTONIC} - Seems not to be supported at all on macOS. - */ - itv.it_interval.tv_sec = interval / 1000000; - itv.it_interval.tv_usec = - (suseconds_t)(interval - itv.it_interval.tv_sec * 1000000); - itv.it_value.tv_sec = interval / 1000000; - itv.it_value.tv_usec = - (suseconds_t)(interval - itv.it_value.tv_sec * 1000000); - if (setitimer(ITIMER_PROF, &itv, NULL) == -1) - R_Suicide("setting profile timer failed"); + if (R_Profiling_Event == RPE_WALL) { +# ifdef HAVE_PTHREAD + R_profile_thread_info_t *nfo = &R_Profile_Thread_Info; + + pthread_mutex_init(&nfo->terminate_mu, NULL); + pthread_cond_init(&nfo->terminate_cv, NULL); + nfo->should_terminate = 0; + nfo->interval_us = interval; + sigset_t all, old_set; + sigfillset(&all); + pthread_sigmask(SIG_BLOCK, &all, &old_set); + if (pthread_create(&nfo->thread, NULL, ProfileThread, + nfo)) + R_Suicide("unable to create profiling thread"); + pthread_sigmask(SIG_SETMASK, &old_set, NULL); + +# ifdef HAVE_SCHED_H + /* attempt to set FIFO scheduling with maximum priority + at least on Linux it requires special permissions */ + struct sched_param p; + p.sched_priority = sched_get_priority_max(SCHED_FIFO); + int res = -1; + if (p.sched_priority >= 0) + res = pthread_setschedparam(nfo->thread, SCHED_FIFO, &p); + if (res) { + /* attempt to set maximum priority at least with + the current scheduling policy */ + int policy; + if (!pthread_getschedparam(nfo->thread, &policy, &p)) { + p.sched_priority = sched_get_priority_max(policy); + if (p.sched_priority >= 0) + pthread_setschedparam(nfo->thread, policy, &p); + } + } +# endif +# endif + } else if (R_Profiling_Event == RPE_CPU) { + /* The macOS implementation requires normalization here: + + setitimer is obsolescent (POSIX >= 2008), replaced by + timer_create / timer_settime, but the supported clocks are + implementation-dependent. + + Recent Linux has CLOCK_PROCESS_CPUTIME_ID + Solaris has CLOCK_PROF, in -lrt. + FreeBSD only supports CLOCK_{REALTIME,MONOTONIC} + Seems not to be supported at all on macOS. + */ + struct itimerval itv; + itv.it_interval.tv_sec = interval / 1000000; + itv.it_interval.tv_usec = + (suseconds_t)(interval - itv.it_interval.tv_sec * 1000000); + itv.it_value.tv_sec = interval / 1000000; + itv.it_value.tv_usec = + (suseconds_t)(interval - itv.it_value.tv_sec * 1000000); + if (setitimer(ITIMER_PROF, &itv, NULL) == -1) + R_Suicide("setting profile timer failed"); + } #endif /* not Win32 */ R_Profiling = 1; } @@ -743,6 +849,8 @@ SEXP do_Rprof(SEXP args) filter_callframes; double dinterval; int numfiles, bufsize; + const char *event_arg; + rpe_type event; #ifdef BC_PROFILING if (bc_profiling) { @@ -762,9 +870,28 @@ SEXP do_Rprof(SEXP args) numfiles = asInteger(CAR(args)); args = CDR(args); if (numfiles < 0) error(_("invalid '%s' argument"), "numfiles"); - bufsize = asInteger(CAR(args)); + bufsize = asInteger(CAR(args)); args = CDR(args); if (bufsize < 0) error(_("invalid '%s' argument"), "bufsize"); + if (!isString(CAR(args)) || length(CAR(args)) != 1 + || STRING_ELT(CAR(args), 0) == NA_STRING) + error(_("invalid '%s' argument"), "event"); + event_arg = translateChar(STRING_ELT(CAR(args), 0)); +#ifdef Win32 + if (streql(event_arg, "wall") || streql(event_arg, "default")) + event = RPE_WALL; + else if (streql(event_arg, "cpu")) + error("event type '%s' not supported on this platform", event_arg); + else + error(_("invalid '%s' argument"), "event"); +#else + if (streql(event_arg, "cpu") || streql(event_arg, "default")) + event = RPE_CPU; + else if (streql(event_arg, "wall")) + event = RPE_WALL; + else + error(_("invalid '%s' argument"), "event"); +#endif #if defined(linux) || defined(__linux__) if (dinterval < 0.01) { @@ -782,7 +909,7 @@ SEXP do_Rprof(SEXP args) if (LENGTH(filename)) R_InitProfiling(filename, append_mode, dinterval, mem_profiling, gc_profiling, line_profiling, filter_callframes, - numfiles, bufsize); + numfiles, bufsize, event); else R_EndProfiling(); return R_NilValue;