diff --git a/src/coreclr/debug/crashreport/CMakeLists.txt b/src/coreclr/debug/crashreport/CMakeLists.txt index f88699a4c6a464..eedc3715be4536 100644 --- a/src/coreclr/debug/crashreport/CMakeLists.txt +++ b/src/coreclr/debug/crashreport/CMakeLists.txt @@ -3,6 +3,7 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON) set(CRASHREPORT_SOURCES signalsafejsonwriter.cpp inproccrashreporter.cpp + inproccrashreportwatchdog.cpp ) add_library(inproccrashreport OBJECT ${CRASHREPORT_SOURCES}) diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.cpp b/src/coreclr/debug/crashreport/inproccrashreporter.cpp index fe771432eee5f4..d5940d94e757d9 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.cpp +++ b/src/coreclr/debug/crashreport/inproccrashreporter.cpp @@ -6,12 +6,14 @@ // Streams a createdump-shaped JSON skeleton to a crashreport.json file. #include "inproccrashreporter.h" +#include "inproccrashreportwatchdog.h" #include "signalsafejsonwriter.h" #include "pal.h" #include #include +#include #include #include #include @@ -233,6 +235,7 @@ InProcCrashReporter::CreateReport( { return; } + CrashReportWatchdogScope watchdogScope; char reportPath[CRASHREPORT_PATH_BUFFER_SIZE]; reportPath[0] = '\0'; @@ -345,6 +348,8 @@ InProcCrashReporter::Initialize( m_enumerateThreadsCallback = settings.enumerateThreadsCallback; CrashReportHelpers::CopyString(m_reportPath, sizeof(m_reportPath), settings.reportPath); + (void)CrashReportWatchdog::TryInitialize(settings.timeoutSeconds); + m_processName[0] = '\0'; #if defined(__ANDROID__) // On Android every app forks from the Zygote, so /proc/self/exe always diff --git a/src/coreclr/debug/crashreport/inproccrashreporter.h b/src/coreclr/debug/crashreport/inproccrashreporter.h index 5018f3b0d10793..4b3eb3c52ed9a9 100644 --- a/src/coreclr/debug/crashreport/inproccrashreporter.h +++ b/src/coreclr/debug/crashreport/inproccrashreporter.h @@ -60,6 +60,7 @@ using InProcCrashReportEnumerateThreadsCallback = void (*)( struct InProcCrashReporterSettings { 
const char* reportPath; + uint32_t timeoutSeconds; InProcCrashReportIsManagedThreadCallback isManagedThreadCallback; InProcCrashReportWalkStackCallback walkStackCallback; InProcCrashReportEnumerateThreadsCallback enumerateThreadsCallback; diff --git a/src/coreclr/debug/crashreport/inproccrashreportwatchdog.cpp b/src/coreclr/debug/crashreport/inproccrashreportwatchdog.cpp new file mode 100644 index 00000000000000..15255570c1cb60 --- /dev/null +++ b/src/coreclr/debug/crashreport/inproccrashreportwatchdog.cpp @@ -0,0 +1,375 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +// POSIX watchdog for in-proc crash reporting: a detached thread waits for a start notification on a pipe and aborts the process if the matching finish notification does not arrive before the configured timeout. + +#include "inproccrashreportwatchdog.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static constexpr uint32_t CRASH_REPORT_WATCHDOG_SECONDS_TO_MILLISECONDS = 1000; +static constexpr int CRASH_REPORT_WATCHDOG_SIGNAL_EXIT_CODE_OFFSET = 128; + +LONG CrashReportWatchdog::s_initializationStarted; +CrashReportWatchdog* CrashReportWatchdog::s_instance; +volatile sig_atomic_t CrashReportWatchdog::s_writeFd = -1; + +uint32_t +CrashReportWatchdog::ClampTimeoutSeconds(uint32_t timeoutSeconds) +{ + uint32_t maxTimeoutSeconds = static_cast(std::numeric_limits::max() / CRASH_REPORT_WATCHDOG_SECONDS_TO_MILLISECONDS); + if (timeoutSeconds > maxTimeoutSeconds) + { + return maxTimeoutSeconds; + } + + return timeoutSeconds; +} + +CrashReportWatchdog::CrashReportWatchdog(uint32_t timeoutSeconds) + : m_timeoutSeconds(ClampTimeoutSeconds(timeoutSeconds)), + m_timeoutMs(static_cast(m_timeoutSeconds * CRASH_REPORT_WATCHDOG_SECONDS_TO_MILLISECONDS)), + m_thread() +{ + m_pipe[0] = -1; + m_pipe[1] = -1; +} + +CrashReportWatchdog::~CrashReportWatchdog() +{ + ClosePipe(); +} + +bool +CrashReportWatchdog::TryInitialize(uint32_t timeoutSeconds) +{ + if (timeoutSeconds == 0) + { + return false; + } + 
+ if (InterlockedCompareExchange(&s_initializationStarted, 1, 0) != 0) + { + return s_writeFd != -1; + } + + // The watchdog is best-effort. The one-time flag prevents duplicate + // watchdog threads, but setup failures reset it so a later init can retry. + CrashReportWatchdog* watchdog = new (std::nothrow) CrashReportWatchdog(timeoutSeconds); + if (watchdog == nullptr) + { + InterlockedExchange(&s_initializationStarted, 0); + return false; + } + + if (!watchdog->Initialize()) + { + delete watchdog; + InterlockedExchange(&s_initializationStarted, 0); + return false; + } + + // Keep the watchdog alive for process lifetime; the detached thread and + // pipe remain available after initialization succeeds. s_writeFd is the value WriteCommand reads on the crash-reporting path. + s_instance = watchdog; + s_writeFd = watchdog->m_pipe[1]; + return true; +} + +bool +CrashReportWatchdog::Initialize() +{ + if (!InitializePipe()) + { + return false; + } + + // Block fatal signals before pthread_create so the watchdog inherits the + // mask; restore this thread's mask immediately after creation. This keeps + // process-directed fatal signals from landing on the watchdog thread. 
+ sigset_t signalSet; + sigset_t previousSignalSet; + BuildFatalSignalSet(&signalSet); + int maskResult = pthread_sigmask(SIG_BLOCK, &signalSet, &previousSignalSet); + if (maskResult != 0) + { + ClosePipe(); + return false; + } + + if (pthread_create(&m_thread, nullptr, ThreadEntry, this) != 0) + { + (void)pthread_sigmask(SIG_SETMASK, &previousSignalSet, nullptr); + ClosePipe(); + return false; + } + + (void)pthread_sigmask(SIG_SETMASK, &previousSignalSet, nullptr); + + (void)pthread_detach(m_thread); + return true; +} + +bool +CrashReportWatchdog::InitializePipe() +{ + if (pipe(m_pipe) != 0) + { + return false; + } + + if (!ConfigurePipeFd(m_pipe[0]) || + !ConfigurePipeFd(m_pipe[1])) + { + ClosePipe(); + return false; + } + + return true; +} + +bool +CrashReportWatchdog::ConfigurePipeFd(int fd) +{ +#ifdef FD_CLOEXEC + int descriptorFlags = fcntl(fd, F_GETFD); + if (descriptorFlags == -1 || fcntl(fd, F_SETFD, descriptorFlags | FD_CLOEXEC) != 0) + { + return false; + } +#endif + + int statusFlags = fcntl(fd, F_GETFL); + return statusFlags != -1 && fcntl(fd, F_SETFL, statusFlags | O_NONBLOCK) == 0; +} + +void +CrashReportWatchdog::ClosePipe() +{ + if (m_pipe[0] != -1) + { + close(m_pipe[0]); + m_pipe[0] = -1; + } + + if (m_pipe[1] != -1) + { + close(m_pipe[1]); + m_pipe[1] = -1; + } +} + +void +CrashReportWatchdog::BuildFatalSignalSet(sigset_t* signalSet) +{ + sigemptyset(signalSet); + sigaddset(signalSet, SIGABRT); + sigaddset(signalSet, SIGBUS); + sigaddset(signalSet, SIGFPE); + sigaddset(signalSet, SIGILL); + sigaddset(signalSet, SIGSEGV); + sigaddset(signalSet, SIGTRAP); +} + +void* +CrashReportWatchdog::ThreadEntry(void* context) +{ + CrashReportWatchdog* watchdog = static_cast(context); + if (watchdog != nullptr) + { + watchdog->ThreadLoop(); + } + + return nullptr; +} + +void +CrashReportWatchdog::ThreadLoop() +{ + sigset_t signalSet; + BuildFatalSignalSet(&signalSet); + (void)pthread_sigmask(SIG_BLOCK, &signalSet, nullptr); + + // Keep within minipal's 
portable 15-character limit to avoid truncation. + (void)minipal_set_thread_name(pthread_self(), ".NET CrashWdg"); + + if (!WaitForCommand(Command::Started)) + { + // The watchdog is best-effort: if the notification pipe is broken, the + // watchdog can no longer observe crash-report progress. Retrying would + // spin forever, so leave termination to the platform's normal handling. + minipal_log_write_error( + "In-proc crash report watchdog failed while waiting for a start notification; exiting watchdog thread.\n"); + return; + } + + minipal_log_print_info( + "In-proc crash report watchdog started monitoring with a %lu second timeout.\n", + static_cast(m_timeoutSeconds)); + + if (!WaitForCommand(Command::Finished, m_timeoutMs)) + { + minipal_log_write_error( + "In-proc crash report watchdog did not receive a finish notification before the timeout; aborting process.\n"); + Abort(); + } +} + +bool +CrashReportWatchdog::WaitForCommand(Command expectedCommand, int timeoutMs) +{ + int readFd = m_pipe[0]; + if (readFd == -1) + { + return false; + } + + int64_t deadlineMs = 0; + if (timeoutMs != CRASH_REPORT_WATCHDOG_INFINITE_TIMEOUT_MS) + { + deadlineMs = minipal_lowres_ticks() + timeoutMs; + } + + while (true) + { + int currentTimeoutMs = timeoutMs; + if (timeoutMs != CRASH_REPORT_WATCHDOG_INFINITE_TIMEOUT_MS) + { + currentTimeoutMs = GetRemainingTimeoutMs(deadlineMs); + if (currentTimeoutMs == 0) + { + return false; + } + } + + struct pollfd pollFd; + pollFd.fd = readFd; + pollFd.events = POLLIN; + pollFd.revents = 0; + + int pollResult = poll(&pollFd, 1, currentTimeoutMs); + if (pollResult == -1) + { + if (errno == EINTR) + { + continue; + } + + return false; + } + + if (pollResult == 0 || (pollFd.revents & POLLIN) == 0) + { + return false; + } + + char command; + ssize_t readResult = read(readFd, &command, sizeof(command)); + if (readResult == sizeof(command)) + { + if (command == static_cast(expectedCommand)) + { + return true; + } + + continue; + } + + if 
(readResult == -1 && (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)) + { + continue; + } + + return false; + } +} + +// Called from CrashReportWatchdogScope on the crash-reporting path. Keep this +// async-signal-safe and preserve errno so watchdog notification does not +// perturb the failing thread's crash-reporting state. The write is retried only on EINTR; any other failure is ignored because the watchdog is best-effort. +void +CrashReportWatchdog::WriteCommand(Command command) +{ + int savedErrno = errno; + sig_atomic_t writeFd = s_writeFd; + if (writeFd != -1) + { + char commandValue = static_cast(command); + while (true) + { + ssize_t writeResult = write(static_cast(writeFd), &commandValue, sizeof(commandValue)); + if (writeResult == sizeof(commandValue)) + { + break; + } + + if (writeResult != -1 || errno != EINTR) + { + break; + } + } + } + + errno = savedErrno; +} + +int +CrashReportWatchdog::GetRemainingTimeoutMs(int64_t deadlineMs) +{ + int64_t remainingMs = deadlineMs - minipal_lowres_ticks(); + if (remainingMs <= 0) + { + return 0; + } + + int maxTimeoutMs = std::numeric_limits::max(); + if (remainingMs > maxTimeoutMs) + { + return maxTimeoutMs; + } + + return static_cast(remainingMs); +} + +// Terminate from the watchdog thread using the default SIGABRT action. The +// watchdog only gets here after the crash reporter started but did not finish +// before its configured timeout. 
+void +CrashReportWatchdog::Abort() +{ + struct sigaction action; + memset(&action, 0, sizeof(action)); + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + (void)sigaction(SIGABRT, &action, nullptr); + + sigset_t signalSet; + sigemptyset(&signalSet); + sigaddset(&signalSet, SIGABRT); + (void)pthread_sigmask(SIG_UNBLOCK, &signalSet, nullptr); + + abort(); + _exit(CRASH_REPORT_WATCHDOG_SIGNAL_EXIT_CODE_OFFSET + SIGABRT); +} + +CrashReportWatchdogScope::CrashReportWatchdogScope() +{ + CrashReportWatchdog::WriteCommand(CrashReportWatchdog::Command::Started); +} + +CrashReportWatchdogScope::~CrashReportWatchdogScope() +{ + CrashReportWatchdog::WriteCommand(CrashReportWatchdog::Command::Finished); +} diff --git a/src/coreclr/debug/crashreport/inproccrashreportwatchdog.h b/src/coreclr/debug/crashreport/inproccrashreportwatchdog.h new file mode 100644 index 00000000000000..628d522d10b393 --- /dev/null +++ b/src/coreclr/debug/crashreport/inproccrashreportwatchdog.h @@ -0,0 +1,73 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +// POSIX watchdog for in-proc crash reporting. CrashReportWatchdog owns the notification pipe and the detached monitor thread; CrashReportWatchdogScope brackets a crash report with start and finish notifications. + +#pragma once + +#include "pal.h" + +#include +#include +#include + +class CrashReportWatchdogScope; + +class CrashReportWatchdog +{ +public: + // Attempts to initialize the watchdog during normal runtime startup. This is + // not async-signal-safe and must not be called from the crash-reporting path. Returns true when the write end of the notification pipe is available; returns false when timeoutSeconds is 0 or when setup fails. 
+ static bool TryInitialize(uint32_t timeoutSeconds); + +private: + friend class CrashReportWatchdogScope; + + static constexpr int CRASH_REPORT_WATCHDOG_INFINITE_TIMEOUT_MS = -1; + + enum class Command : char + { + Started = 'S', + Finished = 'F', + }; + + static uint32_t ClampTimeoutSeconds(uint32_t timeoutSeconds); + + explicit CrashReportWatchdog(uint32_t timeoutSeconds); + ~CrashReportWatchdog(); + + bool Initialize(); + bool InitializePipe(); + bool ConfigurePipeFd(int fd); + void ClosePipe(); + + static void BuildFatalSignalSet(sigset_t* signalSet); + static void* ThreadEntry(void* context); + + void ThreadLoop(); + bool WaitForCommand(Command expectedCommand, int timeoutMs = CRASH_REPORT_WATCHDOG_INFINITE_TIMEOUT_MS); + static void WriteCommand(Command command); + static int GetRemainingTimeoutMs(int64_t deadlineMs); + static void Abort(); + + uint32_t m_timeoutSeconds; + int m_timeoutMs; + pthread_t m_thread; + int m_pipe[2]; + + static LONG s_initializationStarted; + static CrashReportWatchdog* s_instance; + static volatile sig_atomic_t s_writeFd; +}; + +class CrashReportWatchdogScope +{ +public: + // The constructor and destructor run in the crash-reporting path. Keep them + // async-signal-safe: they may only notify the pre-created watchdog channel. 
+ CrashReportWatchdogScope(); + ~CrashReportWatchdogScope(); + + CrashReportWatchdogScope(const CrashReportWatchdogScope&) = delete; + CrashReportWatchdogScope& operator=(const CrashReportWatchdogScope&) = delete; +}; diff --git a/src/coreclr/vm/crashreportstackwalker.cpp b/src/coreclr/vm/crashreportstackwalker.cpp index 1670ec970d91ff..ae984ce4804ba8 100644 --- a/src/coreclr/vm/crashreportstackwalker.cpp +++ b/src/coreclr/vm/crashreportstackwalker.cpp @@ -24,6 +24,10 @@ struct WalkContext }; static void BuildTypeName(LPUTF8 buffer, size_t bufferSize, LPCUTF8 namespaceName, LPCUTF8 className); +static bool IsDecimalDigits(const char* value); +// Parses configuration during CrashReportConfigure initialization. This is not +// async-signal-safe and must not be called from the crash-reporting path. +static DWORD GetCrashReportTimeoutSeconds(); static StackWalkAction @@ -434,6 +438,7 @@ CrashReportConfigure() InProcCrashReporterSettings settings = {}; settings.reportPath = dumpName; + settings.timeoutSeconds = GetCrashReportTimeoutSeconds(); settings.isManagedThreadCallback = CrashReportIsCurrentThreadManaged; settings.walkStackCallback = CrashReportWalkStack; settings.enumerateThreadsCallback = CrashReportEnumerateThreads; @@ -443,4 +448,48 @@ CrashReportConfigure() InProcCrashReportInitialize(settings); } +static bool +IsDecimalDigits(const char* value) +{ + if (value == nullptr || value[0] == '\0') + { + return false; + } + + for (const char* current = value; *current != '\0'; current++) + { + if (*current < '0' || *current > '9') + { + return false; + } + } + + return true; +} + +// Parses configuration during CrashReportConfigure initialization. This is not +// async-signal-safe and must not be called from the crash-reporting path. Falls back to DefaultTimeoutSeconds when the setting is unset, is not made up solely of decimal digits, or is rejected by TryAsInteger. 
+static DWORD +GetCrashReportTimeoutSeconds() +{ + // Keep the default conservative: successful reports can be large, while 0 + // remains available to disable the watchdog for diagnostics. 
+ static constexpr DWORD DefaultTimeoutSeconds = 30; + + CLRConfigNoCache timeoutCfg = CLRConfigNoCache::Get("CrashReportTimeoutSeconds", /*noprefix*/ false, &getenv); + if (!timeoutCfg.IsSet()) + { + return DefaultTimeoutSeconds; + } + + const char* timeoutString = timeoutCfg.AsString(); + DWORD timeoutSeconds; + if (!IsDecimalDigits(timeoutString) || !timeoutCfg.TryAsInteger(10, timeoutSeconds)) + { + return DefaultTimeoutSeconds; + } + + return timeoutSeconds; +} + #endif // FEATURE_INPROC_CRASHREPORT