17 #include <ndb_global.h>
18 #include <ndb_version.h>
23 #include <NdbConfig.h>
24 #include <NdbAutoPtr.hpp>
25 #include <portlib/ndb_daemon.h>
26 #include <portlib/NdbSleep.h>
27 #include <portlib/NdbDir.hpp>
29 #include <ConfigRetriever.hpp>
31 #include <EventLogger.hpp>
37 ndb_daemon_exit(code);
40 #include "../mgmapi/mgmapi_configuration.hpp"
// Fragment of a function whose parameter list matches the
// reportShutdown(config, nodeid, error_exit, restart, nostart, initial,
// error, signal, sphase) calls made later in this file -- presumably its
// definition; the leading parameter(s) and return type are not visible here.
44 NodeId nodeid,
int error_exit,
45 bool restart,
bool nostart,
bool initial,
46 Uint32 error, Uint32 signum, Uint32 sphase)
50 (!restart && !initial && !nostart));
// Event payload: theData holds up to 25 words, length is passed along with it
// to both the local logger and ndb_mgm_report_event() below.
52 Uint32 length, theData[25];
55 rep->setNodeId(nodeid);
// Log the event locally.
79 g_eventLogger->
log(rep->getEventType(), theData, length,
// Walk the configuration and test each entry for section type NODE_TYPE_MGM;
// the branch taken when the test fails is not visible in this excerpt.
84 for (iter.first(); iter.valid(); iter.next())
87 if (iter.get(CFG_TYPE_OF_SECTION, &type) ||
88 type != NODE_TYPE_MGM)
// Read port and host of the entry (a non-zero return from iter.get() is
// handled as a lookup failure elsewhere in this file -- TODO confirm here).
92 if (iter.get(CFG_MGM_PORT, &port))
96 if (iter.get(CFG_NODE_HOST, &hostname))
// Build the host:port connect string used in the messages below.
100 connect_str.
assfmt(
"%s:%d", hostname, port);
// Warning emitted when no management handle could be created.
106 g_eventLogger->
warning(
"Unable to report shutdown reason "
107 "to '%s'(failed to create mgm handle)",
108 connect_str.
c_str());
// Send the same theData/length payload through handle h; the warning that
// follows reports a failure of this step.
114 ndb_mgm_report_event(h, theData, length))
116 g_eventLogger->
warning(
"Unable to report shutdown reason "
117 "to '%s'(error: %s - %s)",
// Table of signal numbers; every entry is passed to signal(..., SIG_IGN) by
// the loop that follows. The entries themselves (and the matching #if for the
// #elif below) are not visible in this excerpt.
131 static const int ignore_list[] = {
141 #elif defined SIGINFO
177 for(
size_t i = 0;
i <
sizeof(ignore_list)/
sizeof(ignore_list[0]);
i++)
178 signal(ignore_list[
i], SIG_IGN);
// pipe() wrapper that forwards to _pipe() with a fixed buffer size of 4096.
// The definition of flags is not visible in this excerpt.
// Returns whatever _pipe() returns.
183 int pipe(
int pipefd[2]){
184 const unsigned int buffer_size = 4096;
186 return _pipe(pipefd, buffer_size, flags);
// The only options value the waitpid() replacement below accepts (it asserts
// options == WNOHANG).
194 static const int WNOHANG = 37;
// waitpid() replacement built on process handles: opens the process by pid,
// queries its exit code, and stores that code in *stat_loc.
// Return values are not visible in this excerpt.
197 pid_t waitpid(pid_t pid,
int *stat_loc,
int options)
200 assert(options == WNOHANG);
// NOTE(review): PROCESS_ALL_ACCESS is requested here, while kill() in this
// file opens the process with SYNCHRONIZE | PROCESS_QUERY_INFORMATION before
// the same GetExitCodeProcess() call -- consider narrowing.
// NOTE(review): no CloseHandle(handle) appears in the visible lines of this
// function -- confirm against the full source that the handle is released.
203 HANDLE handle = OpenProcess(PROCESS_ALL_ACCESS, FALSE, pid);
206 g_eventLogger->
error(
"waitpid: Could not open handle for pid %d, "
207 "error: %d", pid, GetLastError());
212 if (!GetExitCodeProcess(handle, &exit_code))
214 g_eventLogger->
error(
"waitpid: GetExitCodeProcess failed, pid: %d, "
215 "error: %d", pid, GetLastError());
// STILL_ACTIVE is tested before the status is stored; presumably the function
// returns early while the process is running -- TODO confirm.
221 if (exit_code == STILL_ACTIVE)
227 *stat_loc = exit_code;
// Status-inspection helpers, each taking an int status. In this file they are
// applied to the status filled in by waitpid(). Only the signatures are
// visible in this excerpt; the bodies are not shown.
233 bool WIFEXITED(
int status)
239 int WEXITSTATUS(
int status)
245 bool WIFSIGNALED(
int status)
251 int WTERMSIG(
int status)
// kill() replacement: asks the process to stop by signalling a named event,
// ndbd_shutdown_<pid>. The sig argument is not referenced in the visible
// lines. Return type and return values are not visible in this excerpt.
257 kill(pid_t pid,
int sig)
// Countdown consumed by the retry_open_event-- test further down.
259 int retry_open_event = 10;
// Event name is derived from the pid.
261 char shutdown_event_name[32];
262 _snprintf(shutdown_event_name,
sizeof(shutdown_event_name),
263 "ndbd_shutdown_%d", pid);
266 HANDLE shutdown_event;
// Loop for as long as the event cannot be opened.
267 while ((shutdown_event =
268 OpenEvent(EVENT_MODIFY_STATE, FALSE, shutdown_event_name)) == NULL)
// Open the process and read its exit code; the handle is closed on both the
// failure path and the normal path.
275 HANDLE process = OpenProcess(SYNCHRONIZE | PROCESS_QUERY_INFORMATION,
283 if (!GetExitCodeProcess(process,&exit_code))
285 g_eventLogger->
error(
"GetExitCodeProcess failed, pid: %d, error: %d",
286 pid, GetLastError());
287 CloseHandle(process);
290 CloseHandle(process);
// Branch for a process that is no longer running (body not visible).
292 if (exit_code != STILL_ACTIVE)
// Post-decrement: the test is true while the counter is non-zero, i.e. for the
// first 10 evaluations. Presumably the true branch retries and the error below
// is logged once the counter is exhausted -- TODO confirm.
298 if (retry_open_event--)
302 g_eventLogger->
error(
"Failed to open shutdown_event '%s', error: %d",
303 shutdown_event_name, GetLastError());
// Signal the event; SetEvent() returning 0 is logged as a failure.
308 if (SetEvent(shutdown_event) == 0)
310 g_eventLogger->
error(
"Failed to signal shutdown_event '%s', error: %d",
311 shutdown_event_name, GetLastError());
313 CloseHandle(shutdown_event);
// Entry point defined in another translation unit; invoked as
// real_main(argc, argv) further down in this file.
318 extern int real_main(
int,
char**);
// Body fragment that builds an argv array from args (presumably
// create_argv(), which spawn_process calls): room for args.size() pointers
// plus one, each slot a strdup() copy of the argument, last slot NULL.
// NOTE(review): no check of the malloc() or strdup() results appears in the
// visible lines -- confirm against the full source.
324 char **argv = (
char **)malloc(
sizeof(
char*) * (args.size() + 1));
328 for(
unsigned i = 0;
i < args.size();
i++)
329 argv[
i] = strdup(args[
i].c_str());
// NULL terminator after the last argument.
330 argv[args.size()] = NULL;
// Counterpart that takes an argv array; body not visible in this excerpt.
336 void free_argv(
char** argv)
// Body fragments of spawn_process (name taken from the log messages below);
// the signature is not visible in this excerpt.
// Resolve the path of the running module. A length of 0 or exactly
// sizeof(path) is reported as failure to extract the full path.
354 DWORD len = GetModuleFileName(NULL, path,
sizeof(path));
355 if (len == 0 || len ==
sizeof(path))
357 g_eventLogger->
warning(
"spawn_process: Could not extract full path, "
358 "len: %u, error: %u\n",
359 len, GetLastError());
// Convert args into an argv array for _spawnv().
368 char** argv = create_argv(args);
371 g_eventLogger->
error(
"spawn_process: Failed to create argv, errno: %d",
// Start the program without waiting for it (P_NOWAIT); -1 is the failure
// value, in which case progname and each argv entry are logged.
378 intptr_t spawn_handle = _spawnv(P_NOWAIT, progname, argv);
379 if (spawn_handle == -1)
381 g_eventLogger->
error(
"spawn_process: Failed to spawn process, errno: %d",
384 g_eventLogger->
error(
" progname: '%s'", progname);
387 g_eventLogger->
error(
"argv: '%s'", *argp++);
// Translate the returned handle into a pid. The handle is closed both after a
// failed conversion and on the normal path.
395 DWORD pid = GetProcessId((HANDLE)spawn_handle);
// NOTE(review): spawn_handle is an intptr_t but is formatted with %d below;
// if the logger uses printf-style formatting this mismatches on 64-bit builds.
398 g_eventLogger->
error(
"spawn_process: Failed to convert handle %d "
399 "to pid, error: %d", spawn_handle, GetLastError());
400 CloseHandle((HANDLE)spawn_handle);
403 CloseHandle((HANDLE)spawn_handle);
// Fork-based variant (presumably the non-Windows branch -- the preprocessor
// conditionals are not visible): logs a fork failure, and calls real_main();
// presumably in the child process -- TODO confirm.
409 g_eventLogger->
error(
"Failed to fork, errno: %d", errno);
428 (void)real_main(argc, argv);
// Body fragments of a retry wrapper around spawn_process() (presumably
// retry_spawn_process(), which angel_run calls with the same arguments).
// The loop construct and the spawn-success test are not visible here.
443 const unsigned max_retries = 10;
444 unsigned retry_counter = 0;
447 pid_t pid = spawn_process(progname, args);
// The counter is compared before it is incremented, so this branch is taken
// on the evaluation where retry_counter has already reached max_retries.
450 if (retry_counter++ == max_retries)
452 g_eventLogger->
error(
"Angel failed to spawn %d times, giving up",
// Otherwise wait one second before the next attempt.
457 g_eventLogger->
warning(
"Angel failed to spawn, sleep and retry");
459 NdbSleep_SecSleep(1);
// Settings filled in from the fetched configuration by the code below.
// config_max_start_fail_retries and config_restart_delay_secs are read again
// by the restart logic later in this file.
466 static Uint32 stop_on_error;
467 static Uint32 config_max_start_fail_retries;
468 static Uint32 config_restart_delay_secs;
// Body fragments, presumably of configure(config, nodeid) which angel_run
// calls; the signature and the iterator setup are not visible here.
// A missing generation value only produces a warning.
478 Uint32 generation = 0;
480 if (sys_iter.get(CFG_SYS_CONFIG_GENERATION, &generation))
// NOTE(review): the message below opens a parenthesis that is never closed.
482 g_eventLogger->
warning(
"Configuration didn't contain generation "
483 "(likely old ndb_mgmd");
485 g_eventLogger->
debug(
"Using configuration with generation %u", generation);
// Locate the section for this node id; a non-zero result is reported as an
// invalid configuration.
488 if (iter.find(CFG_NODE_ID, nodeid))
490 g_eventLogger->
error(
"Invalid configuration fetched, could not "
491 "find own node id %d", nodeid);
// StopOnError: failing to read it is logged as an error.
495 if (iter.get(CFG_DB_STOP_ON_ERROR, &stop_on_error))
497 g_eventLogger->
error(
"Invalid configuration fetched, could not "
501 g_eventLogger->
debug(
"Using StopOnError: %u", stop_on_error);
// Optional settings: the assignments of 3 and 0 follow the failed lookups and
// appear to be fallback defaults -- TODO confirm, braces are not visible.
503 if (iter.get(CFG_DB_MAX_START_FAIL, &config_max_start_fail_retries))
506 config_max_start_fail_retries = 3;
509 if (iter.get(CFG_DB_START_FAIL_DELAY_SECS, &config_restart_delay_secs))
512 config_restart_delay_secs = 0;
// DataDir: failing to read it is logged as an error.
515 const char * datadir;
516 if (iter.get(CFG_NODE_DATADIR, &datadir))
518 g_eventLogger->
error(
"Invalid configuration fetched, could not "
522 g_eventLogger->
debug(
"Using DataDir: %s", datadir);
// Register the data directory as the NdbConfig path and change into it; a
// failed chdir is logged at warning level.
524 NdbConfig_SetPath(datadir);
526 if (NdbDir::chdir(NdbConfig_get_path(NULL)) != 0)
528 g_eventLogger->
warning(
"Cannot change directory to '%s', error: %d",
529 NdbConfig_get_path(NULL), errno);
// Flag initialised to false; its readers and writers are not visible in this
// excerpt (presumably set by angel_stop() -- TODO confirm).
536 bool stop_child =
false;
// angel_run: connects to the management server, allocates a node id, fetches
// and applies the configuration, then repeatedly spawns a child process,
// waits for it and decides from its exit status whether to restart it.
// Only part of the parameter list and body is visible in this excerpt; the
// loop constructs and most branch bodies are not shown.
539 angel_run(
const char* progname,
541 const char* connect_str,
543 const char* bind_address,
553 if (retriever.hasError())
555 g_eventLogger->
error(
"Could not initialize connection to management "
556 "server, error: '%s'", retriever.getErrorString());
// Initial connect: 12 retries with a delay of 5 between them (units are not
// shown here), verbose on.
// NOTE(review): connnect_retries is spelled with three n.
560 const int connnect_retries = 12;
561 const int connect_delay = 5;
562 const int verbose = 1;
563 if (retriever.do_connect(connnect_retries, connect_delay, verbose) != 0)
565 g_eventLogger->
error(
"Could not connect to management server, "
566 "error: '%s'", retriever.getErrorString());
569 g_eventLogger->
info(
"Angel connected to '%s:%d'",
570 retriever.get_mgmd_host(),
571 retriever.get_mgmd_port());
// Allocate a node id from the management server (2 retries, delay 3).
573 const int alloc_retries = 2;
574 const int alloc_delay = 3;
575 const Uint32 nodeid = retriever.allocNodeId(alloc_retries, alloc_delay);
578 g_eventLogger->
error(
"Failed to allocate nodeid, error: '%s'",
579 retriever.getErrorString());
582 g_eventLogger->
info(
"Angel allocated nodeid: %u", nodeid);
// Error path for a configuration that could not be fetched or is invalid.
588 g_eventLogger->
error(
"Could not fetch configuration/invalid "
589 "configuration, error: '%s'",
590 retriever.getErrorString());
// Apply the fetched configuration for this node id.
594 if (!configure(config, nodeid))
// Pid file and stdout log file names are derived from the node id and handed
// to ndb_daemonize().
603 char *lockfile = NdbConfig_PidFileName(nodeid);
604 char *logfile = NdbConfig_StdoutFileName(nodeid);
607 if (ndb_daemonize(lockfile, logfile) != 0)
609 g_eventLogger->
error(
"Couldn't start as daemon, error: '%s'",
// Counts consecutive failed startups of the child; reset to 0 further down.
616 Uint32 failed_startups_counter = 0;
// Pipe used by the child to report back: fds[0] is opened here as a read
// stream, fds[1] is passed to the child via --report-fd.
624 g_eventLogger->
error(
"Failed to create pipe, errno: %d (%s)",
625 errno, strerror(errno));
630 if (!(child_info_r = fdopen(fds[0],
"r")))
632 g_eventLogger->
error(
"Failed to open stream for pipe, errno: %d (%s)",
633 errno, strerror(errno));
// Child command line: the original arguments plus the report fd, the
// allocated node id and the current initial / no_start values.
642 args = original_args;
646 one_arg.
assfmt(
"--report-fd=%d", fds[1]);
647 args.push_back(one_arg);
650 one_arg.
assfmt(
"--allocated-nodeid=%d", nodeid);
651 args.push_back(one_arg);
653 one_arg.
assfmt(
"--initial=%d", initial);
654 args.push_back(one_arg);
656 one_arg.
assfmt(
"--nostart=%d", no_start);
657 args.push_back(one_arg);
// Start the child, retrying on spawn failure.
659 pid_t child = retry_spawn_process(progname, args);
663 g_eventLogger->
error(
"retry_spawn_process, child: %d", child);
670 g_eventLogger->
info(
"Angel pid: %d started child: %d",
// Poll for child exit with WNOHANG, pausing via NdbSleep_MilliSleep(100)
// (milliseconds, going by the function name) between polls.
675 int status=0, error_exit=0;
678 pid_t ret_pid = waitpid(child, &status, WNOHANG);
679 if (ret_pid == child)
681 g_eventLogger->
debug(
"Angel got child %d", child);
686 g_eventLogger->
warning(
"Angel got unexpected pid %d "
687 "when waiting for %d",
693 g_eventLogger->
info(
"Angel shutting down ndbd with pid %d", child);
696 NdbSleep_MilliSleep(100);
// Parse what the child wrote to the pipe, one line at a time. Recognised
// lines have the forms error=N, signal=N and sphase=N; any other non-empty
// line is echoed to stderr. The declaration of value is not visible here.
704 Uint32 child_error = 0, child_signal = 0, child_sphase = 0;
705 while (fgets(buf,
sizeof (buf), child_info_r))
708 if (sscanf(buf,
"error=%d\n", &value) == 1)
710 else if (sscanf(buf,
"signal=%d\n", &value) == 1)
711 child_signal = value;
712 else if (sscanf(buf,
"sphase=%d\n", &value) == 1)
713 child_sphase = value;
714 else if (strcmp(buf,
"\n") != 0)
715 fprintf(stderr,
"unknown info from child: '%s'\n", buf);
717 g_eventLogger->
debug(
"error: %u, signal: %u, sphase: %u",
718 child_error, child_signal, child_sphase);
720 fclose(child_info_r);
// Normal exit: dispatch on the exit status. The NRT_* cases name restart
// variants; their bodies and the first case label are not visible here.
722 if (WIFEXITED(status))
724 switch (WEXITSTATUS(status)) {
726 g_eventLogger->
info(
"Angel shutting down");
727 reportShutdown(config, nodeid, 0, 0,
false,
false,
728 child_error, child_signal, child_sphase);
731 case NRT_NoStart_Restart:
735 case NRT_NoStart_InitialStart:
739 case NRT_DoStart_InitialStart:
750 reportShutdown(config, nodeid,
751 error_exit, 0,
false,
false,
752 child_error, child_signal, child_sphase);
756 case NRT_DoStart_Restart:
// Exit by signal: the terminating signal replaces any signal value reported
// through the pipe.
764 if (WIFSIGNALED(status))
766 child_signal = WTERMSIG(status);
771 g_eventLogger->
info(
"Unknown exit reason. Stopped.");
778 reportShutdown(config, nodeid,
779 error_exit, 0,
false,
false,
780 child_error, child_signal, child_sphase);
// Startup-failure accounting: a reported start phase of at most 6 is part of
// the condition below (its first half is not visible). Reaching
// config_max_start_fail_retries failures raises an alert and reports the
// shutdown; otherwise the configured restart delay is applied.
786 const Uint32 STARTUP_FAILURE_SPHASE = 6;
787 Uint32 restart_delay_secs = 0;
789 child_sphase <= STARTUP_FAILURE_SPHASE)
791 if (++failed_startups_counter >= config_max_start_fail_retries)
// NOTE(review): failed_startups_counter is a Uint32 but is formatted with %d
// in the alert below, while the info message after it uses %u.
793 g_eventLogger->
alert(
"Angel detected too many startup failures(%d), "
794 "not restarting again", failed_startups_counter);
795 reportShutdown(config, nodeid,
796 error_exit, 0,
false,
false,
797 child_error, child_signal, child_sphase);
800 g_eventLogger->
info(
"Angel detected startup failure, count: %u",
801 failed_startups_counter);
803 restart_delay_secs = config_restart_delay_secs;
// Counter reset (presumably the branch for a child that got past the startup
// phases -- TODO confirm).
808 failed_startups_counter = 0;
811 reportShutdown(config, nodeid,
815 child_error, child_signal, child_sphase);
816 g_eventLogger->
info(
"Ndb has terminated (pid %d) restarting", child);
// Before restarting: drop the management connection, optionally sleep for the
// restart delay (in seconds, per the log message), then reconnect.
818 g_eventLogger->
debug(
"Angel reconnecting to management server");
819 (void)retriever.disconnect();
821 if (restart_delay_secs > 0)
823 g_eventLogger->
info(
"Delaying Ndb restart for %u seconds.",
825 NdbSleep_SecSleep(restart_delay_secs);
// Same connect parameters as the initial connect above.
828 const int connnect_retries = 12;
829 const int connect_delay = 5;
830 const int verbose = 1;
831 if (retriever.do_connect(connnect_retries, connect_delay, verbose) != 0)
833 g_eventLogger->
error(
"Could not connect to management server, "
834 "error: '%s'", retriever.getErrorString());
837 g_eventLogger->
info(
"Angel reconnected to '%s:%d'",
838 retriever.get_mgmd_host(),
839 retriever.get_mgmd_port());
// Ask for the same node id again (10 retries here versus 2 initially) and
// treat any other id as an error.
842 retriever.setNodeId(nodeid);
844 g_eventLogger->
debug(
"Angel reallocating nodeid %d", nodeid);
845 const int alloc_retries = 10;
846 const int alloc_delay = 3;
847 const Uint32 realloced = retriever.allocNodeId(alloc_retries, alloc_delay);
850 g_eventLogger->
error(
"Angel failed to allocate nodeid, error: '%s'",
851 retriever.getErrorString());
854 if (realloced != nodeid)
856 g_eventLogger->
error(
"Angel failed to reallocate nodeid %d, got %d",
860 g_eventLogger->
info(
"Angel reallocated nodeid: %u", nodeid);
871 void angel_stop(
void)