18 #include <NdbRestarts.hpp>
22 #include <kernel/ndb_limits.h>
23 #include <signaldata/DumpStateOrd.hpp>
25 #include <NDBT_Test.hpp>
// Parameter list shared by every restart function declared below: the NDBT
// test context, an NdbRestarter reference and the NdbRestart descriptor that
// is being executed.
27 #define F_ARGS NDBT_Context* ctx, NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart
// Forward declarations of the restart functions referenced from the restart
// table. All of them take F_ARGS and return an int status.
29 int restartRandomNodeGraceful(F_ARGS);
30 int restartRandomNodeAbort(F_ARGS);
31 int restartRandomNodeError(F_ARGS);
32 int restartRandomNodeInitial(F_ARGS);
33 int restartNFDuringNR(F_ARGS);
34 int restartMasterNodeError(F_ARGS);
35 int twoNodeFailure(F_ARGS);
36 int fiftyPercentFail(F_ARGS);
37 int twoMasterNodeFailure(F_ARGS);
// NOTE(review): "Gracfeul" is a misspelling of "Graceful". The same spelling
// is used by the restart table and by the definition, so it can only be
// corrected by renaming all three places together.
38 int restartAllNodesGracfeul(F_ARGS);
39 int restartAllNodesAbort(F_ARGS);
40 int restartAllNodesError9999(F_ARGS);
41 int fiftyPercentStopAndWait(F_ARGS);
42 int restartNodeDuringLCP(F_ARGS);
43 int stopOnError(F_ARGS);
// Entries of the restart table (m_restarts). Each NdbRestart(...) entry pairs
// a name, which getRestart(name) matches with strcmp, with the function that
// performs the restart. Some entries also show their restart type
// (e.g. MULTIPLE_NODE_RESTART), which listRestarts(type) filters on.
// The array declarator and the remaining constructor arguments are not part
// of this view.
66 NdbRestart(
"RestartRandomNode",
68 restartRandomNodeGraceful,
74 NdbRestart(
"RestartRandomNodeAbort",
76 restartRandomNodeAbort,
83 NdbRestart(
"RestartRandomNodeError",
85 restartRandomNodeError,
91 NdbRestart(
"RestartMasterNodeError",
93 restartMasterNodeError,
99 NdbRestart(
"RestartRandomNodeInitial",
101 restartRandomNodeInitial,
108 NdbRestart(
"RestartNFDuringNR",
118 NdbRestart(
"StopOnError",
132 NdbRestart(
"TwoNodeFailure",
133 MULTIPLE_NODE_RESTART,
141 NdbRestart(
"TwoMasterNodeFailure",
142 MULTIPLE_NODE_RESTART,
143 twoMasterNodeFailure,
146 NdbRestart(
"FiftyPercentFail",
147 MULTIPLE_NODE_RESTART,
161 NdbRestart(
"RestartAllNodes",
163 restartAllNodesGracfeul,
169 NdbRestart(
"RestartAllNodesAbort",
171 restartAllNodesAbort,
177 NdbRestart(
"RestartAllNodesError9999",
179 restartAllNodesError9999,
186 NdbRestart(
"FiftyPercentStopAndWait",
188 fiftyPercentStopAndWait,
193 NdbRestart(
"RestartNodeDuringLCP",
195 restartNodeDuringLCP,
199 const int NdbRestarts::m_NoOfRestarts =
sizeof(m_restarts) /
sizeof(NdbRestart);
// Table of error inserts served by getError() and getRandomError().
// The only entry visible here is named "Error9999" and carries error
// number 9999.
202 const NdbRestarts::NdbErrorInsert NdbRestarts::m_errors[] = {
203 NdbErrorInsert(
"Error9999", 9999)
206 const int NdbRestarts::m_NoOfErrors =
sizeof(m_errors) /
sizeof(NdbErrorInsert);
// NdbRestart constructor. The assignments visible here store the restart
// function and the number of nodes the restart requires (compared against
// the cluster size in executeRestart). The remaining parameters and
// assignments are not part of this view.
208 NdbRestarts::NdbRestart::NdbRestart(
const char* _name,
209 NdbRestartType _type,
215 m_restartFunc = _func;
216 m_numRequiredNodes = _requiredNodes;
// Returns m_NoOfRestarts, i.e. the number of entries in the m_restarts table.
221 int NdbRestarts::getNumRestarts(){
222 return m_NoOfRestarts;
// Index-based lookup into m_restarts (the enclosing signature is not visible
// in this view). Only the upper bound of _num is tested before indexing.
// NOTE(review): if _num is a signed type, a negative value passes this test
// and indexes before the start of m_restarts - confirm callers never pass
// one, or add a `_num < 0` check.
226 if (_num >= m_NoOfRestarts)
229 return &m_restarts[_num];
// Name-based lookup (the enclosing signature is not visible in this view).
// Scans m_restarts front to back and returns the address of the first entry
// whose m_name compares equal to _name with strcmp.
233 for(
int i = 0;
i < m_NoOfRestarts;
i++){
234 if (strcmp(m_restarts[
i].m_name, _name) == 0){
235 return &m_restarts[
i];
// Reached only when no entry matched: report the unknown name on g_err.
238 g_err <<
"The restart \""<< _name <<
"\" not found in NdbRestarts" << endl;
// Runs one restart described by _restart and then waits for the cluster to
// come back. Only part of the body is visible in this view; the comments
// below describe the visible statements only.
245 unsigned int _timeout){
// The restart declares how many data nodes it needs; report on g_err when the
// cluster has fewer. (What is returned in that case is not visible here.)
249 if (_restart->m_numRequiredNodes > restarter.getNumDbNodes()){
250 g_err <<
"This test requires " << _restart->m_numRequiredNodes <<
" nodes "
251 <<
"there are only "<< restarter.getNumDbNodes() <<
" nodes in cluster"
// The cluster must report started (waitClusterStarted(120) == 0) before the
// restart is attempted.
255 if (restarter.waitClusterStarted(120) != 0){
// Invoke the restart function stored in the descriptor; its result is kept
// in `res`.
261 int res = _restart->m_restartFunc(ctx, restarter, _restart);
// Pause two seconds before polling the cluster state.
264 NdbSleep_SecSleep(2);
// Unbounded wait: keep polling with waitClusterStarted(60) and log on every
// miss. NOTE(review): presumably this branch is selected by a _timeout value
// of 0 - the selecting condition is not visible here, confirm.
268 while(restarter.waitClusterStarted(60) != 0)
269 g_err <<
"Cluster is not started after restart. Waiting 60s more..."
// Bounded wait using the caller-supplied _timeout.
272 if (restarter.waitClusterStarted(_timeout) != 0){
273 g_err<<
"Cluster failed to start" << endl;
// Tail of an executeRestart overload: a descriptor `r` has been resolved
// (that lookup is not visible in this view) and the work is delegated to the
// descriptor-based executeRestart with the same context and timeout.
283 unsigned int _timeout){
288 int res = executeRestart(ctx, r, _timeout);
// Tail of another executeRestart overload: once a descriptor `r` has been
// resolved (not visible in this view) it delegates to the descriptor-based
// executeRestart with the same context and timeout.
294 unsigned int _timeout){
299 int res = executeRestart(ctx, r, _timeout);
// Prints, on ndbout, every entry of m_restarts whose m_type equals _type:
// the entry name followed by ", min " and its m_numRequiredNodes.
303 void NdbRestarts::listRestarts(NdbRestartType _type){
304 for(
int i = 0;
i < m_NoOfRestarts;
i++){
305 if (m_restarts[
i].m_type == _type)
306 ndbout <<
" " << m_restarts[
i].m_name <<
", min "
307 << m_restarts[
i].m_numRequiredNodes
// Prints the complete restart catalogue on ndbout, grouped under one heading
// per restart type: node restarts, multiple node restarts, system restarts.
312 void NdbRestarts::listRestarts(){
313 ndbout <<
"NODE RESTARTS" << endl;
314 listRestarts(NODE_RESTART);
315 ndbout <<
"MULTIPLE NODE RESTARTS" << endl;
316 listRestarts(MULTIPLE_NODE_RESTART);
317 ndbout <<
"SYSTEM RESTARTS" << endl;
318 listRestarts(SYSTEM_RESTART);
// NdbErrorInsert constructor. The visible assignment stores the error number;
// the remaining parameters and assignments are not part of this view.
321 NdbRestarts::NdbErrorInsert::NdbErrorInsert(
const char* _name,
325 m_errorNo = _errorNo;
// Accessor for the number of error inserts (body not visible in this view).
// NOTE(review): presumably returns m_NoOfErrors, mirroring getNumRestarts -
// confirm.
328 int NdbRestarts::getNumErrorInserts(){
332 const NdbRestarts::NdbErrorInsert* NdbRestarts::getError(
int _num){
333 if (_num >= m_NoOfErrors)
335 return &m_errors[_num];
// Returns the address of an entry of m_errors, selected by the index that
// myRandom48(m_NoOfErrors) yields.
// NOTE(review): presumably myRandom48(n) returns a value in [0, n) - confirm,
// because the result is used as an index without a range check.
338 const NdbRestarts::NdbErrorInsert* NdbRestarts::getRandomError(){
339 int randomId = myRandom48(m_NoOfErrors);
340 return &m_errors[randomId];
/**
 * CHECK(b, m): evaluate condition `b` exactly once. When it is false, print
 * "ERR: <m>" together with the source file, the line number and the
 * evaluated value on ndbout, and make the ENCLOSING function return
 * NDBT_FAILED.
 *
 *  b - condition expression (parenthesized in the expansion)
 *  m - message; it is spliced into an `ndbout << ...` chain, so it may itself
 *      contain further `<<` operands
 *
 * The expansion is wrapped in do { } while (0) so that `CHECK(...);` is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define CHECK(b, m) do { const int check_result_ = (b); if (!check_result_) { \
  ndbout << "ERR: " << m \
         << " " << "File: " << __FILE__ \
         << " (Line: " << __LINE__ << ")" << "- " << check_result_ << endl; \
  return NDBT_FAILED; } } while (0)
// Restarts one randomly chosen data node using restartOneDbNode with only
// the node id supplied (no extra flags).
362 int restartRandomNodeGraceful(F_ARGS){
// Seed the random generator from the current millisecond tick.
364 myRandom48Init((
long)NdbTick_CurrentMillisecond());
// Pick a random position among the data nodes and translate it to a node id.
365 int randomId = myRandom48(_restarter.getNumDbNodes());
366 int nodeId = _restarter.getDbNodeId(randomId);
368 g_info << _restart->m_name <<
": node = "<<nodeId << endl;
370 CHECK(_restarter.restartOneDbNode(nodeId) == 0,
371 "Could not restart node "<<nodeId);
// Restarts one randomly chosen data node, passing (false, false, true) as the
// extra restartOneDbNode flags.
// NOTE(review): presumably the flags are (initial, nostart, abort), which
// would make this an abort restart - confirm against NdbRestarter.
376 int restartRandomNodeAbort(F_ARGS){
// Seed the random generator from the current millisecond tick.
378 myRandom48Init((
long)NdbTick_CurrentMillisecond());
// Pick a random position among the data nodes and translate it to a node id.
379 int randomId = myRandom48(_restarter.getNumDbNodes());
380 int nodeId = _restarter.getDbNodeId(randomId);
382 g_info << _restart->m_name <<
": node = "<<nodeId << endl;
384 CHECK(_restarter.restartOneDbNode(nodeId,
false,
false,
true) == 0,
385 "Could not restart node "<<nodeId);
// Inserts error 9999 into one randomly chosen data node.
390 int restartRandomNodeError(F_ARGS){
// Seed the random generator from the current millisecond tick.
392 myRandom48Init((
long)NdbTick_CurrentMillisecond());
// Pick a random position among the data nodes and translate it to a node id.
393 int randomId = myRandom48(_restarter.getNumDbNodes());
394 int nodeId = _restarter.getDbNodeId(randomId);
// NOTE(review): the sibling restart functions log this line to g_info; this
// one writes to ndbout - confirm whether that difference is intentional.
396 ndbout << _restart->m_name <<
": node = "<<nodeId << endl;
// The message says "restart" although the call made here is an error insert.
398 CHECK(_restarter.insertErrorInNode(nodeId, 9999) == 0,
399 "Could not restart node "<<nodeId);
// Inserts error 39999 into the data node returned by getDbNodeId(0).
// NOTE(review): the node is taken from position 0 of the node list rather
// than from getMasterNodeId() (which twoMasterNodeFailure uses), and the
// error number is 39999 where the sibling functions insert 9999 - confirm
// both are intended.
404 int restartMasterNodeError(F_ARGS){
406 int nodeId = _restarter.getDbNodeId(0);
408 g_info << _restart->m_name <<
": node = "<<nodeId << endl;
410 CHECK(_restarter.insertErrorInNode(nodeId, 39999) == 0,
411 "Could not restart node "<<nodeId);
// Restarts one randomly chosen data node, passing `true` as the first extra
// restartOneDbNode flag.
// NOTE(review): presumably that flag requests an initial restart, as the
// function name suggests - confirm against NdbRestarter.
416 int restartRandomNodeInitial(F_ARGS){
// Seed the random generator from the current millisecond tick.
418 myRandom48Init((
long)NdbTick_CurrentMillisecond());
// Pick a random position among the data nodes and translate it to a node id.
419 int randomId = myRandom48(_restarter.getNumDbNodes());
420 int nodeId = _restarter.getDbNodeId(randomId);
422 g_info << _restart->m_name <<
": node = "<<nodeId << endl;
424 CHECK(_restarter.restartOneDbNode(nodeId,
true) == 0,
425 "Could not restart node "<<nodeId);
// Fails two data nodes from different node groups shortly after each other
// by inserting error 9999 into each, waits until both are in the no-start
// state and then starts them again. Parts of the body (e.g. the declarations
// of `n` and `max`) are not visible in this view.
430 int twoNodeFailure(F_ARGS){
// Seed the random generator from the current millisecond tick.
432 myRandom48Init((
long)NdbTick_CurrentMillisecond());
433 int randomId = myRandom48(_restarter.getNumDbNodes());
// n[0]: random data node; n[1]: a node the restarter picks from another
// node group.
// NOTE(review): rand() is used here although the generator seeded above is
// myRandom48 (restartNFDuringNR passes myRandom48(1000) to the same call).
435 n[0] = _restarter.getDbNodeId(randomId);
436 n[1] = _restarter.getRandomNodeOtherNodeGroup(n[0], rand());
437 g_info << _restart->m_name <<
": node = "<< n[0] << endl;
// Send the CmvmiSetRestartOnErrorInsert dump code with argument 1 to both
// nodes before the errors are inserted.
439 int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
440 CHECK(_restarter.dumpStateOneNode(n[0], val2, 2) == 0,
442 CHECK(_restarter.dumpStateOneNode(n[1], val2, 2) == 0,
// Fail the first node.
445 CHECK(_restarter.insertErrorInNode(n[0], 9999) == 0,
446 "Could not restart node "<< n[0]);
// Wait a random number of milliseconds in [1, max] before failing the second.
450 int ms = (myRandom48(max)) + 1;
451 g_info <<
"Waiting for " << ms <<
"(" << max
453 NdbSleep_MilliSleep(ms);
455 g_info << _restart->m_name <<
": node = "<< n[1] << endl;
// Fail the second node.
456 CHECK(_restarter.insertErrorInNode(n[1], 9999) == 0,
457 "Could not restart node "<< n[1]);
459 CHECK(_restarter.waitNodesNoStart(n, 2) == 0,
460 "Failed to wait nostart");
// NOTE(review): the startNodes result is ignored here, whereas
// restartNFDuringNR wraps the same call in CHECK - confirm this is intended.
462 _restarter.startNodes(n, 2);
// Fails the current master node and a next-in-line master from a different
// node group by inserting error 9999 into each, waits until both are in the
// no-start state and then starts them again. Parts of the body (e.g. the
// declarations of `n` and `max`, and the start of the do-loop) are not
// visible in this view.
467 int twoMasterNodeFailure(F_ARGS){
470 n[0] = _restarter.getMasterNodeId();
// Walk the chain of next-master candidates until one is found in a node
// group other than n[0]'s.
// NOTE(review): the initial value of n[1] is set outside this view -
// presumably n[0]; confirm.
473 n[1] = _restarter.getNextMasterNodeId(n[1]);
474 }
while(_restarter.getNodeGroup(n[0]) == _restarter.getNodeGroup(n[1]));
476 g_info << _restart->m_name <<
": ";
477 g_info <<
"node0 = "<< n[0] <<
"(" << _restarter.getNodeGroup(n[0]) <<
") ";
478 g_info <<
"node1 = "<< n[1] <<
"(" << _restarter.getNodeGroup(n[1]) <<
") ";
// Send the CmvmiSetRestartOnErrorInsert dump code with argument 1 to both
// nodes before the errors are inserted.
481 int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
482 CHECK(_restarter.dumpStateOneNode(n[0], val2, 2) == 0,
484 CHECK(_restarter.dumpStateOneNode(n[1], val2, 2) == 0,
// Fail the master.
487 CHECK(_restarter.insertErrorInNode(n[0], 9999) == 0,
488 "Could not restart node "<< n[0]);
// Wait a random number of milliseconds in [1, max] before failing the second.
492 int ms = (myRandom48(max)) + 1;
493 g_info <<
"Waiting for " << ms <<
"(" << max
495 NdbSleep_MilliSleep(ms);
497 g_info << _restart->m_name <<
": node = "<< n[1] << endl;
// Fail the second node.
499 CHECK(_restarter.insertErrorInNode(n[1], 9999) == 0,
500 "Could not restart node "<< n[1]);
502 CHECK(_restarter.waitNodesNoStart(n, 2) == 0,
503 "Failed to wait nostart");
// NOTE(review): the startNodes result is ignored here, whereas
// restartNFDuringNR wraps the same call in CHECK - confirm this is intended.
505 _restarter.startNodes(n, 2);
// Body of the helper that selects half of the data nodes (its signature is
// not visible in this view; fiftyPercentFail calls it as
// get50PercentOfNodes(_restarter, nodes)).
// Half the node count, rounded down by integer division.
515 int num50Percent = restarter.getNumDbNodes() / 2;
// Guards the caller-supplied buffer, which callers size as MAX_NDB_NODES.
516 assert(num50Percent <= MAX_NDB_NODES);
// Take every second node from the restarter's node list (positions 0, 2, ...).
519 for (
int i = 0;
i < num50Percent;
i++){
520 _nodes[
i] = restarter.getDbNodeId(
i*2);
// Stops half of the data nodes one by one, waits until they are all in the
// no-start state and then starts all nodes again.
525 int fiftyPercentFail(F_ARGS){
528 int nodes[MAX_NDB_NODES];
// Fill `nodes` with the selected half; numNodes is how many were selected.
530 int numNodes = get50PercentOfNodes(_restarter, nodes);
// NOTE(review): the flags (false, true, true) are presumably
// (initial, nostart, abort), i.e. stop the node without letting it start
// again - confirm against NdbRestarter.
533 for (
int i = 0;
i < numNodes;
i++){
534 g_info <<
"Stopping node "<< nodes[
i] << endl;
535 int res = _restarter.restartOneDbNode(nodes[
i],
false,
true,
true);
536 CHECK(res == 0,
"Could not stop node: "<< nodes[i]);
539 CHECK(_restarter.waitNodesNoStart(nodes, numNodes) == 0,
543 ndbout <<
"Starting all nodes" << endl;
544 CHECK(_restarter.startAll() == 0,
545 "Could not start all nodes");
// Restarts every node through restartAll() called without arguments.
// ("Gracfeul" is a misspelling kept because the prototype and the restart
// table use the same identifier.)
551 int restartAllNodesGracfeul(F_ARGS){
553 g_info << _restart->m_name << endl;
556 CHECK(_restarter.restartAll() == 0,
557 "Could not restart all nodes");
// Restarts every node through restartAll(false, false, true).
// NOTE(review): presumably the flags are (initial, nostart, abort), which
// would make this an abort restart - confirm against NdbRestarter.
563 int restartAllNodesAbort(F_ARGS){
565 g_info << _restart->m_name << endl;
568 CHECK(_restarter.restartAll(
false,
false,
true) == 0,
569 "Could not restart all nodes");
// Brings the whole cluster down by inserting error 9999 into every node,
// waits until the cluster is in the no-start state and starts it again.
574 int restartAllNodesError9999(F_ARGS){
576 g_info << _restart->m_name << endl;
// Send the CmvmiSetRestartOnErrorInsert dump code with argument 1 to all
// nodes.
578 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 } ;
579 CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
580 "failed to set RestartOnErrorInsert");
// Error 932 is inserted first; per the message it relates to auto-restart on
// an arbitration error.
582 CHECK(_restarter.insertErrorInAllNodes(932) == 0,
583 "Failed to set error 932 (auto-restart on arbit error)");
586 CHECK(_restarter.insertErrorInAllNodes(9999) == 0,
587 "Could not restart all nodes ");
589 CHECK(_restarter.waitClusterNoStart() == 0,
590 "Failed to wait not started");
// NOTE(review): the startAll result is ignored here, whereas
// fiftyPercentFail wraps the same call in CHECK - confirm this is intended.
592 _restarter.startAll();
// Stops half of the data nodes, waits until they are in the no-start state,
// sleeps a random number of seconds and then restarts all nodes. The
// declaration of `max` is not visible in this view.
597 int fiftyPercentStopAndWait(F_ARGS){
599 int nodes[MAX_NDB_NODES];
// Fill `nodes` with the selected half; numNodes is how many were selected.
600 int numNodes = get50PercentOfNodes(_restarter, nodes);
// NOTE(review): the flags (false, true, true) are presumably
// (initial, nostart, abort) - confirm against NdbRestarter.
603 for (
int i = 0;
i < numNodes;
i++){
604 g_info <<
"Stopping node "<<nodes[
i] << endl;
605 int res = _restarter.restartOneDbNode(nodes[
i],
false,
true,
true);
606 CHECK(res == 0,
"Could not stop node: "<< nodes[i]);
609 CHECK(_restarter.waitNodesNoStart(nodes, numNodes) == 0,
// Sleep a random number of seconds in [1, max].
614 int seconds = (myRandom48(max)) + 1;
615 g_info <<
"Waiting for " << seconds <<
"(" << max
616 <<
") secs " << endl;
617 NdbSleep_SecSleep(seconds);
621 CHECK(_restarter.restartAll() == 0,
622 "Could not restart all nodes");
624 g_info << _restart->m_name << endl;
// Error codes that restartNFDuringNR iterates over, one error insert per
// loop round. Only part of the initializer list is visible in this view.
630 NFDuringNR_codes[] = {
647 7195, 7196,7197,7198,7199,
// Node-failure-during-node-restart scenario driven by NFDuringNR_codes.
// Two loops are visible. The first inserts each error code into the node that
// is being restarted. The second inserts it into a different "crash" node
// while the first node restarts. Several lines (declarations of `i`, `buf`,
// `crashNodeId`, and the statements between the two loops) are not visible in
// this view.
661 int restartNFDuringNR(F_ARGS){
// Seed the random generator from the current millisecond tick.
663 myRandom48Init((
long)NdbTick_CurrentMillisecond());
665 const int sz =
sizeof(NFDuringNR_codes)/
sizeof(NFDuringNR_codes[0]);
// First loop: one round per error code, each on a freshly picked random node.
666 for(i = 0; i<sz; i++){
667 int randomId = myRandom48(_restarter.getNumDbNodes());
668 int nodeId = _restarter.getDbNodeId(randomId);
669 int error = NFDuringNR_codes[
i];
671 g_err << _restart->m_name <<
": node = " << nodeId
672 <<
" error code = " << error << endl;
// Bring the node down and wait until it reports the no-start state.
674 CHECK(_restarter.restartOneDbNode(nodeId,
false,
true,
true) == 0,
675 "Could not restart node "<< nodeId);
677 CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
678 "waitNodesNoStart failed");
// Arm the stopped node: send the CmvmiSetRestartOnErrorInsert dump code with
// argument 1, then insert the error code.
680 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 } ;
681 CHECK(_restarter.dumpStateOneNode(nodeId, val, 2) == 0,
682 "failed to set RestartOnErrorInsert");
684 CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
685 "failed to set error insert");
// Start the node, give it three seconds, then expect it back in the no-start
// state (presumably because the inserted error fired during its restart).
687 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
688 "failed to start node");
690 NdbSleep_SecSleep(3);
692 CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
693 "waitNodesNoStart failed");
// Start it a second time and wait until it is fully started.
695 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
696 "failed to start node");
698 CHECK(_restarter.waitNodesStarted(&nodeId, 1) == 0,
699 "waitNodesStarted failed");
// The second loop is guarded by a node-count test (fewer than four data
// nodes) and by a test on the USER environment variable.
704 if(_restarter.getNumDbNodes() < 4)
// NOTE(review): this compares USER against the hard-coded name "ejonore",
// which looks like a developer-only switch - confirm whether the second loop
// is meant to run for anyone else.
708 if(NdbEnv_GetEnv(
"USER", buf, 256) == 0 || strcmp(buf,
"ejonore") != 0)
// Second loop: one round per error code, ending early when the test context
// reports that the test was stopped.
711 for(i = 0; i<sz && !ctx->isTestStopped(); i++){
712 const int randomId = myRandom48(_restarter.getNumDbNodes());
713 int nodeId = _restarter.getDbNodeId(randomId);
714 const int error = NFDuringNR_codes[
i];
716 const int masterNodeId = _restarter.getMasterNodeId();
717 CHECK(masterNodeId > 0,
"getMasterNodeId failed");
// Pick the crash node from another node group, retrying while the pick is
// the master node.
720 int rand = myRandom48(1000);
721 crashNodeId = _restarter.getRandomNodeOtherNodeGroup(nodeId, rand);
722 }
while(crashNodeId == masterNodeId);
// NOTE(review): crashNodeId comes from getRandomNodeOtherNodeGroup, so the
// "getMasterNodeId failed" text below looks copy-pasted from the check above.
724 CHECK(crashNodeId > 0,
"getMasterNodeId failed");
726 g_info << _restart->m_name <<
" restarting node = " << nodeId
727 <<
" error code = " << error
728 <<
" crash node = " << crashNodeId << endl;
// Bring the restarting node down and wait until it reports no-start.
730 CHECK(_restarter.restartOneDbNode(nodeId,
false,
true,
true) == 0,
731 "Could not restart node "<< nodeId);
733 CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
734 "waitNodesNoStart failed");
// Arm the crash node (not the restarting one) with the error code.
736 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
737 CHECK(_restarter.dumpStateOneNode(crashNodeId, val, 2) == 0,
738 "failed to set RestartOnErrorInsert");
740 CHECK(_restarter.insertErrorInNode(crashNodeId, error) == 0,
741 "failed to set error insert");
// Start the restarting node and wait for the whole cluster to be started.
743 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
744 "failed to start node");
746 CHECK(_restarter.waitClusterStarted() == 0,
747 "waitClusterStarted failed");
// Error-code tables used by restartNodeDuringLCP: the first is iterated for
// the master node, the second for a non-master node. Their element type and
// contents are not visible in this view.
754 NRDuringLCP_Master_codes[] = {
776 NRDuringLCP_NonMaster_codes[] = {
// Crashes nodes by error insert while the DihMinTimeBetweenLCP dump code is
// in effect. The first loop uses the master node with the codes in
// NRDuringLCP_Master_codes; the second uses a random non-master node with the
// codes in NRDuringLCP_NonMaster_codes. Some lines (e.g. the declaration of
// `i` and the inner scopes around the re-declared `val`) are not visible in
// this view.
789 int restartNodeDuringLCP(F_ARGS) {
// Send the DihMinTimeBetweenLCP dump code to all nodes; per the message this
// sets the LCP interval to its minimum.
792 int val = DumpStateOrd::DihMinTimeBetweenLCP;
793 CHECK(_restarter.dumpStateAllNodes(&val, 1) == 0,
794 "Failed to set LCP to min value");
795 int sz =
sizeof(NRDuringLCP_Master_codes)/
796 sizeof(NRDuringLCP_Master_codes[0]);
// Master loop: one round per master error code.
797 for(i = 0; i<sz; i++) {
799 int error = NRDuringLCP_Master_codes[
i];
800 int masterNodeId = _restarter.getMasterNodeId();
802 CHECK(masterNodeId > 0,
"getMasterNodeId failed");
804 ndbout << _restart->m_name <<
" restarting master node = " << masterNodeId
805 <<
" error code = " << error << endl;
// Send the CmvmiSetRestartOnErrorInsert dump code with argument 1 to all
// nodes, then insert the error into the master.
808 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
809 CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
810 "failed to set RestartOnErrorInsert");
813 CHECK(_restarter.insertErrorInNode(masterNodeId, error) == 0,
814 "failed to set error insert");
// Wait (timeout argument 300) for the node to reach no-start, start it again
// and wait for the cluster.
816 CHECK(_restarter.waitNodesNoStart(&masterNodeId, 1, 300) == 0,
817 "failed to wait no start");
819 CHECK(_restarter.startNodes(&masterNodeId, 1) == 0,
820 "failed to start node");
822 CHECK(_restarter.waitClusterStarted(300) == 0,
823 "waitClusterStarted failed");
// Re-send the DihMinTimeBetweenLCP dump code to the restarted node.
// NOTE(review): the message below says "failed to set error insert" although
// the call is a state dump, not an error insert - looks copy-pasted.
826 int val = DumpStateOrd::DihMinTimeBetweenLCP;
827 CHECK(_restarter.dumpStateOneNode(masterNodeId, &val, 1) == 0,
828 "failed to set error insert");
833 sz =
sizeof(NRDuringLCP_NonMaster_codes)/
834 sizeof(NRDuringLCP_NonMaster_codes[0]);
// Non-master loop: one round per non-master error code.
835 for(i = 0; i<sz; i++) {
837 int error = NRDuringLCP_NonMaster_codes[
i];
838 int nodeId = getRandomNodeId(_restarter);
839 int masterNodeId = _restarter.getMasterNodeId();
840 CHECK(masterNodeId > 0,
"getMasterNodeId failed");
// Re-draw until the random node differs from the master.
// NOTE(review): there is no retry limit, so this would not terminate if the
// master were the only data node - confirm the required node count rules
// that out.
842 while (nodeId == masterNodeId) {
843 nodeId = getRandomNodeId(_restarter);
846 ndbout << _restart->m_name <<
" restarting non-master node = " << nodeId
847 <<
" error code = " << error << endl;
// Send the CmvmiSetRestartOnErrorInsert dump code with argument 1 to all
// nodes, then insert the error into the chosen node.
849 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
850 CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
851 "failed to set RestartOnErrorInsert");
853 CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
854 "failed to set error insert");
// Wait (timeout argument 300) for the node to reach no-start, start it again
// and wait for the cluster.
856 CHECK(_restarter.waitNodesNoStart(&nodeId, 1, 300) == 0,
857 "failed to wait no start");
859 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
860 "failed to start node");
862 CHECK(_restarter.waitClusterStarted(300) == 0,
863 "waitClusterStarted failed");
// Re-send the DihMinTimeBetweenLCP dump code to the restarted node.
// NOTE(review): same copy-pasted "failed to set error insert" message as in
// the master loop.
866 int val = DumpStateOrd::DihMinTimeBetweenLCP;
867 CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0,
868 "failed to set error insert");
// Sends the NdbcntrTestStopOnError dump code to one randomly chosen data
// node and verifies that the cluster is started both before and after.
// Some lines of the body are not visible in this view.
875 int stopOnError(F_ARGS){
// Seed the random generator from the current millisecond tick.
877 myRandom48Init((
long)NdbTick_CurrentMillisecond());
// Pick a random position among the data nodes and translate it to a node id.
879 int randomId = myRandom48(_restarter.getNumDbNodes());
880 int nodeId = _restarter.getDbNodeId(randomId);
883 g_info << _restart->m_name <<
": node = " << nodeId
// The cluster must be started before the dump code is sent.
886 CHECK(_restarter.waitClusterStarted(300) == 0,
887 "waitClusterStarted failed");
889 int val = DumpStateOrd::NdbcntrTestStopOnError;
890 CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0,
891 "failed to set NdbcntrTestStopOnError");
// Pause three seconds, then expect the cluster to be started again.
893 NdbSleep_SecSleep(3);
895 CHECK(_restarter.waitClusterStarted(300) == 0,
896 "waitClusterStarted failed");
903 myRandom48Init((
long)NdbTick_CurrentMillisecond());
904 int randomId = myRandom48(_restarter.getNumDbNodes());
905 int nodeId = _restarter.getDbNodeId(randomId);