19 #include <ndb_global.h>
21 #include "WatchDog.hpp"
22 #include "GlobalData.hpp"
25 #include <ErrorHandlingMacros.hpp>
26 #include <Configuration.hpp>
27 #include <EventLogger.hpp>
// Constructor: applies the requested check interval through
// setCheckInterval() (which enforces a lower bound) and creates the mutex
// stored in m_mutex.
// NOTE(review): the member-initializer list that follows the ':' is not
// visible in this excerpt, so the initial values of the remaining members
// cannot be confirmed from here.
40 WatchDog::WatchDog(Uint32 interval) :
43 setCheckInterval(interval);
44 m_mutex = NdbMutex_Create();
// Destructor: destroys the mutex that the constructor created in m_mutex.
// NOTE(review): any other teardown performed here is not visible in this
// excerpt.
49 WatchDog::~WatchDog(){
51 NdbMutex_Destroy(m_mutex);
// Stores the watchdog check interval in theInterval and returns the value
// actually stored. Requests below 70 are raised to 70.
// The stored value is later compared against elapsed times computed as
// microseconds / 1000, i.e. it is treated as a number of milliseconds.
55 WatchDog::setCheckInterval(Uint32 interval){
57 return theInterval = (interval < 70 ? 70 : interval);
// Adds a thread to the list of watched threads.
//   counter  - pointer to the thread's watch counter; stored, not copied.
//   threadId - identifier recorded for the thread and used in log messages.
// The whole update is performed while holding m_mutex.
// NOTE(review): the handling of the "list is full" case, the increment of
// m_watchedCount and the return statement are not visible in this excerpt.
61 WatchDog::registerWatchedThread(Uint32 *counter, Uint32 threadId)
65 NdbMutex_Lock(m_mutex);
// Capacity check against the fixed-size m_watchedList.
67 if (m_watchedCount >= MAX_WATCHED_THREADS)
// Fill in the next free slot (index m_watchedCount).
73 m_watchedList[m_watchedCount].m_watchCounter = counter;
74 m_watchedList[m_watchedCount].m_threadId = threadId;
// The start time is initialised to the current micro timer value.
75 NdbTick_getMicroTimer(&(m_watchedList[m_watchedCount].m_startTime));
// The first "slow" warning threshold is one full check interval.
76 m_watchedList[m_watchedCount].m_slowWarnDelay = theInterval;
77 m_watchedList[m_watchedCount].m_lastCounterValue = 0;
82 NdbMutex_Unlock(m_mutex);
// Removes the entry for threadId from the list of watched threads.
// The search and the removal are performed while holding m_mutex.
// NOTE(review): the declaration of i, the statement executed when the
// thread id matches (presumably a break) and the decrement of
// m_watchedCount are not visible in this excerpt.
87 WatchDog::unregisterWatchedThread(Uint32 threadId)
90 NdbMutex_Lock(m_mutex);
// Linear search for the slot that holds threadId.
92 for (i = 0; i < m_watchedCount; i++)
94 if (threadId == m_watchedList[i].m_threadId)
// The thread is expected to have been registered earlier.
97 assert(i < m_watchedCount);
// Fill the vacated slot with the last entry; list order is not preserved.
98 m_watchedList[
i] = m_watchedList[m_watchedCount - 1];
101 NdbMutex_Unlock(m_mutex);
112 NDB_THREAD_PRIO_HIGH);
// Translates a watch counter value (IPValue) into a human-readable
// description of the activity it stands for. The result is used in the
// "stuck in" warning of the watchdog.
// NOTE(review): the selecting construct, the numeric value tied to each
// string and the return statement are not visible in this excerpt. The
// strings below appear in their original order; "Unknown place" is
// presumably the fallback for unrecognised values - confirm against the
// full source.
127 const char *get_action(Uint32 IPValue)
132 action =
"Job Handling";
135 action =
"Scanning Timers";
138 action =
"External I/O";
141 action =
"Print Job Buffers at crash";
144 action =
"Checking connections";
147 action =
"Performing Send";
150 action =
"Polling for Receive";
153 action =
"Performing Receive";
156 action =
"Allocating memory";
159 action =
"Unknown place";
// Replacement for times() built on the Win32 calls GetProcessTimes() and
// QueryPerformanceCounter().
//   buf - receives the process times: kernel time goes to tms_stime and
//         tms_cstime, user time goes to tms_utime and tms_cutime.
// Returns the current performance counter value converted to clock_t.
// On failure of either Win32 call, errno is set from GetLastError().
// NOTE(review): the return type, the declaration of ticks and the values
// returned on the failure paths are not visible in this excerpt.
// NOTE(review): the values are stored as raw FILETIME / performance counter
// quantities narrowed to clock_t, without unit conversion; confirm that
// callers only use them for relative, informational output.
175 times(
struct tms *
buf)
183 FILETIME create, exit, kernel, user;
184 if (GetProcessTimes(GetCurrentProcess(),
185 &create, &exit, &kernel, &user) == 0)
187 errno = GetLastError();
// Reassemble the two 32-bit halves of the kernel-mode time into one value.
191 ULARGE_INTEGER ulint;
192 ulint.LowPart = kernel.dwLowDateTime;
193 ulint.HighPart = kernel.dwHighDateTime;
// The "children" field is filled with the same value as the process field.
194 buf->tms_stime = (clock_t)ulint.QuadPart;
195 buf->tms_cstime = (clock_t)ulint.QuadPart;
// Same procedure for the user-mode time.
197 ulint.LowPart = user.dwLowDateTime;
198 ulint.HighPart = user.dwHighDateTime;
199 buf->tms_utime = (clock_t)ulint.QuadPart;
200 buf->tms_cutime = (clock_t)ulint.QuadPart;
203 if (QueryPerformanceCounter(&ticks) == 0)
205 errno = GetLastError();
209 return (clock_t)ticks.QuadPart;
214 #include <sys/times.h>
// Body of the watchdog monitoring routine (presumably WatchDog::run(); its
// signature line is not visible in this excerpt).
// Visible behaviour: sleep, detect oversleeping, snapshot the state of all
// watched threads under m_mutex, then - with the mutex released - log a
// warning for threads that look stuck and shut the system down when a
// thread has been stuck for more than three check intervals.
// NOTE(review): the enclosing loop, the declarations of last_time, now,
// my_tms, numThreads and start_time, the assignment of sleep_time, the
// calls that fill my_tms and several branch bodies are not visible here.
220 unsigned int sleep_time;
// Local per-thread snapshots, so that logging and shutdown decisions can be
// made after m_mutex has been released.
223 Uint32 counterValue[MAX_WATCHED_THREADS];
224 Uint32 oldCounterValue[MAX_WATCHED_THREADS];
225 Uint32 threadId[MAX_WATCHED_THREADS];
227 Uint32 theIntervalCheck[MAX_WATCHED_THREADS];
228 Uint32 elapsed[MAX_WATCHED_THREADS];
230 NdbTick_getMicroTimer(&last_time);
237 NdbSleep_MilliSleep(sleep_time);
// Oversleep detection: report when the time that passed since last_time is
// more than twice the requested sleep time (both in milliseconds).
241 NdbTick_getMicroTimer(&now);
242 if (NdbTick_getMicrosPassed(last_time, now)/1000 > sleep_time*2)
246 g_eventLogger->
info(
"Watchdog: User time: %llu System time: %llu",
247 (Uint64)my_tms.tms_utime,
248 (Uint64)my_tms.tms_stime);
249 g_eventLogger->
warning(
"Watchdog: Warning overslept %llu ms, expected %u ms.",
250 NdbTick_getMicrosPassed(last_time, now)/1000,
// Snapshot phase: everything that touches m_watchedList happens under
// m_mutex.
259 NdbMutex_Lock(m_mutex);
260 numThreads = m_watchedCount;
261 for (Uint32 i = 0; i < numThreads; i++)
263 counterValue[
i] = *(m_watchedList[
i].m_watchCounter);
// A non-zero counter is consumed here: the counter is cleared, the start
// time and the warning threshold are reset, and the value is remembered as
// the last seen counter value.
// NOTE(review): presumably a non-zero counter means the thread reported
// activity since the previous check, and the copies below belong to the
// branch taken when it did not - the branch structure is not visible.
264 if (counterValue[i] != 0)
277 *(m_watchedList[
i].m_watchCounter) = 0;
278 m_watchedList[
i].m_startTime = now;
279 m_watchedList[
i].m_slowWarnDelay = theInterval;
280 m_watchedList[
i].m_lastCounterValue = counterValue[
i];
284 start_time[
i] = m_watchedList[
i].m_startTime;
285 threadId[
i] = m_watchedList[
i].m_threadId;
286 oldCounterValue[
i] = m_watchedList[
i].m_lastCounterValue;
287 theIntervalCheck[
i] = m_watchedList[
i].m_slowWarnDelay;
// NOTE(review): the cast binds tighter than the division, so the
// microsecond value is narrowed to Uint32 before it is divided by 1000.
// If NdbTick_getMicrosPassed() returns a 64-bit value (it is printed with
// %llu above), long elapsed times would wrap - confirm this is acceptable.
288 elapsed[
i] = (Uint32)NdbTick_getMicrosPassed(start_time[i], now)/1000;
// Last counter value 9 is treated specially: each time its warning
// threshold is reached, the threshold is pushed out by one more interval.
// NOTE(review): 9 is presumably the "Allocating memory" state reported by
// get_action(); the value-to-string mapping is not visible in this excerpt.
289 if (oldCounterValue[i] == 9 && elapsed[i] >= theIntervalCheck[i])
290 m_watchedList[
i].m_slowWarnDelay += theInterval;
293 NdbMutex_Unlock(m_mutex);
// Reporting phase: works only on the local snapshots taken above.
300 for (Uint32 i = 0; i < numThreads; i++)
// NOTE(review): the statement controlled by this test is not visible;
// presumably threads with a non-zero counter are skipped.
302 if (counterValue[i] != 0)
// Warn unless the thread is in state 9 and has not yet reached its
// (growing) warning threshold.
309 if (oldCounterValue[i] != 9 || elapsed[i] >= theIntervalCheck[i])
311 const char *last_stuck_action = get_action(oldCounterValue[i]);
312 g_eventLogger->
warning(
"Ndb kernel thread %u is stuck in: %s "
314 threadId[i], last_stuck_action, elapsed[i]);
318 g_eventLogger->
info(
"Watchdog: User time: %llu System time: %llu",
319 (Uint64)my_tms.tms_utime,
320 (Uint64)my_tms.tms_stime);
// A thread stuck for more than three check intervals triggers shutdown.
322 if (elapsed[i] > 3 * theInterval)
324 shutdownSystem(last_stuck_action);
// Reports a watchdog-triggered termination through
// ErrorReporter::handleError() with exit code NDBD_EXIT_WATCHDOG_TERMINATE.
//   last_stuck_action - description of the activity the stuck thread was
//                       last seen in.
// NOTE(review): the remaining arguments of the handleError() call are not
// visible in this excerpt; presumably last_stuck_action is passed along.
333 WatchDog::shutdownSystem(
const char *last_stuck_action){
335 ErrorReporter::handleError(NDBD_EXIT_WATCHDOG_TERMINATE,