# -*- coding: utf-8 -*-
from datetime import datetime
from threading import Timer
import sys, time, os, subprocess
from multiprocessing import Pool
import shutil
#import smtplib
#from email.mime.multipart import MIMEMultipart
#from email.mime.text import MIMEText
def printHelp():
print(" ")
print("DESCRIPTION: ")
    print(" HANASitter regularly (def. 1h) checks if HANA is online and primary. If so, it starts to track. Tracking includes ")
    print(" regular (def. 1m) checks if HANA is responsive. If it is not, it will record. Recording can include writing call stacks ")
    print(" of all active threads and/or run time dumps and/or indexserver gstacks and/or kernel profiler traces. By default ")
    print(" nothing is recorded. If HANA is responsive it will check for too many critical features of HANA. By default this checks ")
    print(" if there are more than 30 active threads. If there are, it will record (see above). After it is done recording it will by ")
    print(" default exit, but it can also restart if so desired. ")
print(" After it has concluded that all was good, it will wait (def. 1h) and then start all over to check again if HANA is online ")
print(" and primary. See also SAP Note 2399979. ")
print(" ")
print("PREREQUISITES: ")
print(" - Executed by <sid>adm ")
print(" - A DB user with proper privileges maintained in hdbuserstore (to be used with the -k flag) ")
    print(' - In case you run HANASitter on a virtual host the linux command "hostname" has to return the logical host name (not the physical) ')
print(" - Should be executed on (one of) the host(s) on which (one of) the HANA node(s) is running ")
print(""" - /bin/bash has to return "clean" outputs, e.g. /bin/bash -i -c "alias cdhdb" should ONLY return something like alias cdhdb='cd $DIR_INSTANCE' """)
print(" ")
print("INPUT ARGUMENTS: ")
print(" *** CHECKS (Pings and/or Feature Checks and/or CPU Checks) *** ")
print(" -oi online test interval [seconds], time it waits before it checks if DB is online again, default: 3600 seconds ")
print(" -cpu a 4 items list to control the cpu check: cpu type, number checks, interval, max average CPU in %, default: 0,0,0,100 ")
print(" Possible cpu types are: 0 = not used, 1 = user cpu, 2 = system cpu ")
print(" -pt ping timeout [seconds], time it waits before the DB is considered unresponsive, default: 60 seconds ")
print(' -cf list of features surrounded by two "s; the -cf flag has two modes, 1. One Column Mode and 2. Where Clause Mode ')
print(" 1. One Column Mode: any sys.m_* view, a column in that view, the column value (wildcards, *, before and/or after are possible) and ")
    print(" max number of allowed feature occurrences, i.e. ")
print(' "<m_view 1>,<feature 1>,<[*]value 1[*]>,<limit 1>,...,<m_view N>,<feature N>,<[*]value N[*]>,<limit N>" ')
    print(" 2. Where Clause Mode: any sys.m_* view, the keyword 'WHERE', the where clause and the max number of allowed feature occurrences, i.e. ")
print(' "<m_view 1>,WHERE,<where clause 1>,<limit 1>,...,<m_view N>,WHERE,<where clause N>,<limit N>" ')
print(' default: "" ')
    print(' Note: <limit> should be an integer, or an integer preceded by < (for maximum allowed) or > (for minimum allowed) ')
print(" -if number checks and intervals of checks, every odd item of this list specifies how many times each feature check (see -cf) should be executed")
print(" and every even item specifies how many seconds it waits between each check, then the <max numbers X> in the -cf flag is the maximum ")
print(" allowed average value, e.g. <number checks 1>,<interval [s] 1>,...,<number checks N>,<interval [s] N>, default: [] (not used) ")
print(" -tf feature check time out [seconds], time it waits before the DB is considered unresponsive during a feature check ")
    print(" (see -cf), if -if is used this time out will be added to the interval and then multiplied by the number of checks, default: 60 seconds ")
print(" -lf log features [true/false], logging ALL information of ALL critical features (beware: could be costly!), default: false ")
print(" -ci check interval [seconds], time it waits before it checks cpu, pings and check features again, default: 60 seconds ")
print(" -ar time to sleep after recording [seconds], if negative it exits, default: -1 ")
print(" *** RECORDINGS (GStacks and/or Kernel Profiler Traces and/or Call Stacks and/or RTE dumps) *** ")
    print(" -rm recording mode [1, 2 or 3], 1 = all recordings of one requested recording type are done before the next type starts, in the order above, ")
    print(" e.g. GStack 1, GStack 2, ..., GStack N, RTE 1, RTE 2, ..., RTE N (this is default) ")
    print(" 2 = one recording of each requested recording type is done per round, in the order above, ")
    print(" e.g. GStack 1, RTE 1, GStack 2, RTE 2, ... ")
    print(" 3 = the different recording types are recorded in parallel threads, e.g. if 2 GStacks and 1 RTE are ")
    print(" requested then GStack 1 and RTE 1 are done in parallel, and when both are done GStack 2 starts ")
print(" -rp recording priorities [list of 4 integers [1,4]] defines what order the recording modes will be executed for rm = 1 and rm = 2 ")
print(" # 1 = RTE, # 2 = CallStacks, # 3 = GStacks, # 4 = Kernel Profiler, default: 1,2,3,4 ")
print(" -ng number indexserver gstacks created if the DB is considered unresponsive (Note: gstack blocks the indexserver! See SAP Note 2000000 ")
print(' "Call stack generation via gstack"), default: 0 (not used) ')
print(" -ig gstacks interval [seconds], for -rm = 1: time it waits between each gstack, ")
print(" for -rm = 2: time it waits after a gstack, ")
print(" for -rm = 3: time the thread waits after a gstack, default: 60 seconds ")
print(" -np number indexserver kernel profiler traces created if the DB is considered unresponsive: default: 0 (not used) ")
print(" -dp profiler duration [seconds], how long time it is tracing, default: 60 seconds (more info: SAP Note 1804811) ")
print(" -wp profiler wait time [milliseconds], wait time after callstacks of all running threads have been taken, default 0 ")
print(" -ip profiler interval [seconds], for -rm = 1: time it waits between each profiler trace, ")
print(" for -rm = 2: time it waits after a profiler trace, ")
print(" for -rm = 3: time the thread waits after a profiler trace, default: 60 seconds ")
print(" -nc number call stacks created if the DB is considered unresponsive: default: 0 (not used) ")
print(" -ic call stacks interval [seconds], for -rm = 1: time it waits between each call stack, ")
print(" for -rm = 2: time it waits after a call stack, ")
print(" for -rm = 3: time the thread waits after a call stack, default: 60 seconds ")
print(" -nr number rte dumps created if the DB is considered unresponsive: default: 0 (not used) ")
print(" Note: output is restricted to these folders /tmp, $HOME, $DIR_INSTANCE/work, and $SAP_RETRIEVAL_PATH ")
print(" -ir rte dumps interval [seconds], for -rm = 1: time it waits between each rte dump, ")
print(" for -rm = 2: time it waits after an rte dump, ")
print(" for -rm = 3: time the thread waits after an rte dump, default: 60 seconds ")
print(" -mr rte dump mode [0 or 1], -mr = 0: normal rte dump, ")
print(" -mr = 1: light rte dump mode, only rte dump with STACK_SHORT and THREADS sections, and some M_ views, default: 0 ")
print(" *** KILL SESSIONS (use with care!) *** ")
    print(" -ks kill session [list of true/false], list of booleans (length must be the same as number of features defined by -cf) that defines, ")
    print(" per -cf feature, whether the sessions (connections) matching that feature should be disconnected or not, default: None (not used) ")
print(" Note: Requires SESSION ADMIN ")
print(" *** ADMINS (Output Directory, Logging, Output and DB User) *** ")
print(" -od output directory, full path of the folder where all output files will end up (if the folder does not exist it will be created), ")
print(" default: '/tmp/hanasitter_output' ")
print(" -or output log retention days, hanasitterlogs in the path specified with -od are only saved for this number of days, default: -1 (not used) ")
    print(" -en email notification, <sender's email>,<receiver's email>,<mail server>, default: (not used) ")
print(" example: [email protected],[email protected],smtp.intra.ourcompany.com ")
print(' NOTE: For this to work you have to install the linux program "sendmail" and add a line similar to DSsmtp.intra.ourcompany.com in the file ')
print(" sendmail.cf in /etc/mail/, see https://www.systutorials.com/5167/sending-email-using-mailx-in-linux-through-internal-smtp/ ")
print(" -so standard out switch [true/false], switch to write to standard out, default: true ")
    print(" -ff flag file, full path to a file that contains input flags, each flag on a new line, all lines in the file that do not start with a ")
    print(" flag are considered comments, if this flag is used no other flags should be given, default: '' (not used) ")
    print(" -ssl turns on ssl certificate [true/false], makes it possible to use HANASitter also when SSL is used, default: false ")
    print(" -vlh virtual local host, if hanasitter runs on a virtual host this has to be specified, default: '' (physical host is assumed) ")
print(" -k DB user key, this one has to be maintained in hdbuserstore, i.e. as <sid>adm do ")
print(" > hdbuserstore SET <DB USER KEY> <ENV> <USERNAME> <PASSWORD> , default: SYSTEMKEY ")
print(" ")
print(" ")
    print("EXAMPLE (if > 30 THREAD_STATE=Running, or > 20 THREAD_STATE=Semaphore Wait are found, 2 RTE dumps and 3 GStacks will be recorded ")
print(" in parallel, i.e. RTE1&GStack1, RTE2&GStack2, GStack3): ")
print(' > python hanasitter.py -cf "M_SERVICE_THREADS,THREAD_STATE,Running,30,M_SERVICE_THREADS,THREAD_STATE,Semaphore Wait,20" -nr 2 -ng 3 -rm 3 ')
print(" ")
print("EXAMPLE (if, on average from 3 checks with 5s interval, > 30 THREAD_STATE=Running, or if any column from the table VARINUM has been unloaded, ")
print(" then record two call stacks) ")
print(' > python hanasitter.py -cf "M_SERVICE_THREADS,THREAD_STATE,Running,30,M_CS_UNLOADS,TABLE_NAME,VARINUM,1" -if 3,5,1,0 -nc 2 ')
print(" ")
print("EXAMPLE (Here a where clause is given) ")
print(''' > python hanasitter.py -cf "M_SERVICE_THREADS,WHERE,IS_ACTIVE='TRUE' and SERVICE_NAME='indexserver' and DURATION>420000000,1" -nc 2 ''')
print(" ")
print("EXAMPLE (if average system CPU >95% or Ping > 30 seconds, 2 Call Stacks are recorded, or else it will try again after 120 seconds, after ")
print(" recording it will sleep for one hour before it starts to track again): ")
print(" > python hanasitter.py -cpu 2,5,2,95 -pt 30 -ci 120 -nc 2 -ar 3600 ")
print(" ")
    print("EXAMPLE (if there are more than 10 threads from the Application user AUSER123 or from the DB user DUSER123, record 2 RTE dumps): ")
print(' > python hanasitter.py -cf "M_SERVICE_THREADS,APPLICATION_USER_NAME,AUSER123,10,M_SERVICE_THREADS,USER_NAME,DUSER123,10" -nr 2 ')
print(" ")
    print("EXAMPLE (if there are more than 5 threads with a thread method that starts with PlanExecutor or with a thread type that ")
    print(" includes Attribute or that is executed from any user starting with DUSER12, then 5 GStacks are recorded): ")
print(' > python hanasitter.py -cf "M_SERVICE_THREADS,THREAD_METHOD,PlanExecutor*,5,M_SERVICE_THREADS,THREAD_TYPE,*Attribute*,5,M_SERVICE_THREADS,USER_NAME,DUSER12*,5" -ng 5 ')
print(" ")
print("EXAMPLE (reads a configuration file, but one flag will overwrite what is in the configuration file, i.e. there will be 3 callstacks instead of 2): ")
print(" > python hanasitter.py -ff /tmp/HANASitter/hanasitter_configfile.txt -nc 3 ")
    print(" Where the config file could look like this: ")
print(" MY HANASITTER CONFIGURATION FILE ")
    print(" If more than 20 threads are in state THREAD_STATE=Running ")
print(' -cf "M_SERVICE_THREADS,THREAD_STATE,Running,20" ')
print(" then 2 call stacks ")
print(" -nc 2 ")
print(" with 30 seconds between them ")
print(" -ic 30 ")
print(" are recorded. This is the key in hdbuserstore that is used: ")
print(" -k SYSTEMKEY ")
print(" ")
print("CURRENT KNOWN LIMITATIONS (i.e. TODO LIST): ")
    print(" 1. Record in parallel for different Scale-Out Nodes (should work for some recording types, e.g. RTE dumps) --> TODO ")
    print(" 2. If a high CPU load only happens on one Host, make it possible to record on only that Host - detect if a feature only happens on one Host, ")
    print(" then record only on that Host ")
    print(" 3. It should be possible to check CPU for BOTH system AND user --> TODO ")
    print(" 4. Let HANASitter first check that there is no other hanasitter process running, and refuse to run if there is --> TODO ")
print(" ")
print("AUTHOR: Christian Hansen ")
print(" ")
print(" ")
os._exit(1)
def printDisclaimer():
print(" ")
print("ANY USAGE OF HANASITTER ASSUMES THAT YOU HAVE UNDERSTOOD AND AGREED THAT: ")
print(" 1. HANASitter is NOT SAP official software, so normal SAP support of HANASitter cannot be assumed ")
print(" 2. HANASitter is open source ")
print(' 3. HANASitter is provided "as is" ')
    print(' 4. HANASitter is to be used at "your own risk" ')
    print(" 5. HANASitter is a one-man hobby (developed, maintained and supported only during non-working hours) ")
    print(" 6. All HANASitter documentation has to be read and understood before any usage: ")
print(" a) SAP Note 2399979 ")
print(" b) The .pdf file that can be downloaded at the bottom of SAP Note 2399979 ")
print(" c) All output from executing ")
print(" python hanasitter.py --help ")
    print(" 7. HANASitter can help you to automate certain monitoring tasks but is not an attempt to teach you how to monitor SAP HANA ")
    print(" I.e. if you do not know what you want to do, HANASitter cannot help, but if you do know, HANASitter can automate it ")
print(" 8. HANASitter is not providing any recommendations, all flags shown in the documentation (see point 6.) are only examples ")
os._exit(1)
############ GLOBAL VARIABLES ##############
emailNotification = None
######################## DEFINE CLASSES ##################################
class RTESetting:
def __init__(self, num_rtedumps, rtedumps_interval):
self.num_rtedumps = num_rtedumps
self.rtedumps_interval = rtedumps_interval
class CallStackSetting:
def __init__(self, num_callstacks, callstacks_interval):
self.num_callstacks = num_callstacks
self.callstacks_interval = callstacks_interval
class GStackSetting:
def __init__(self, num_gstacks, gstacks_interval):
self.num_gstacks = num_gstacks
self.gstacks_interval = gstacks_interval
class KernelProfileSetting:
def __init__(self, num_kprofs, kprofs_interval, kprofs_duration, kprofs_wait):
self.num_kprofs = num_kprofs
self.kprofs_interval = kprofs_interval
self.kprofs_duration = kprofs_duration
self.kprofs_wait = kprofs_wait
class EmailNotification:
def __init__(self, senderEmail, recieverEmail, mailServer):
self.senderEmail = senderEmail
#self.senderPassword = senderPassword
self.recieverEmail = recieverEmail
self.mailServer = mailServer
#self.mailServerPort = mailServerPort
#self.timeout = timeout
#self.SID = SID
def printEmailNotification(self):
print "Sender Email: ", self.senderEmail, " Reciever Email: ", self.recieverEmail, " Mail Server: ", self.mailServer
#### Remember:
#Nameserver port is always 3**01 and SQL port = 3**13 valid for,
# - System DB in MDC
#
#If indexserver port = 3**03 then SQL port = 3**15 valid for,
# - Single container in SAP HANA 1.0 and
# - Default tenant starting SAP HANA 2.0 SPS2
#
#If indexserver port ≥ 3**40 then SQL port is always indexserver port +1, valid for
# - All MDC tenants until HANA 2.0 SPS1 and
# - Starting HANA 2 SPS2 with second tenant within a MDC system
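#
# Illustration of the rules above (hypothetical instance number 00, chosen for this comment only):
#   - SystemDB (MDC):                     nameserver port 30001, SQL port 30013
#   - Single container / default tenant:  indexserver port 30003 -> SQL port 30015
#   - MDC tenant (port >= 3**40):         indexserver port 30040 -> SQL port 30041 (indexserver port + 1)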
class Tenant:
def __init__(self, DBName, indexserverPort, instanceNbr, SID):
self.DBName = DBName
self.indexserverPort = int(indexserverPort)
self.instanceNbr = instanceNbr
self.SID = SID
if self.indexserverPort >= int("3"+self.instanceNbr+"40"):
self.sqlPort = self.indexserverPort + 1
elif self.indexserverPort == int("3"+self.instanceNbr+"03"):
self.sqlPort = int("3"+self.instanceNbr+"15")
else:
print "ERROR, something went wrong, indexserver port is not according to the rules; "+str(self.indexserverPort)
os._exit(1)
def printTenant(self):
print "TenantDB: ", self.DBName, " Indexserver Port: ", self.indexserverPort, " Sql Port: ", self.sqlPort
def getIndexserverPortString(self):
return str(self.indexserverPort)
class HDBCONS:
def __init__(self, local_host, hosts, local_dbinstance, is_mdc, is_tenant, communicationPort, SID, rte_mode, tenantDBName = None):
self.local_host = local_host
self.local_dbinstance = local_dbinstance
self.hosts = hosts
self.is_scale_out = (len(hosts) > 1)
self.is_mdc = is_mdc
self.is_tenant = is_tenant
self.communicationPort = communicationPort
self.SID = SID
self.tenantDBName = tenantDBName
self.rte_mode = rte_mode
self.temp_host_output_dirs = []
# SET HDBCONS STRINGS
self.hdbcons_strings = []
for host in self.hosts:
if not self.is_mdc: # not MDC
if not self.is_scale_out:
self.hdbcons_strings.append('hdbcons "')
else:
self.hdbcons_strings.append('hdbcons "distribute exec '+host+':'+self.communicationPort+' ') # SAP Note 2222218
else: # MDC (both SystemDB and Tenant)
self.hdbcons_strings.append('hdbcons -e hdbnameserver "distribute exec '+host+':'+self.communicationPort+' ') # SAP Notes 2222218 and 2410143
def create_temp_output_directories(self): # CREATE TEMPORARY OUTPUT DIRECTORIES and SET PRIVILEGES (CHMOD)
cdtrace_path_local = cdalias('cdtrace', self.local_dbinstance)
if not self.local_host in cdtrace_path_local:
print "ERROR, local host, ", self.local_host, ", is not part of cdtrace, ", cdtrace_path_local
os._exit(1)
for host in self.hosts:
self.temp_host_output_dirs.append(cdtrace_path_local.replace(self.local_host, host)+"hanasitter_temp_out_"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+"/")
for path in self.temp_host_output_dirs:
subprocess.check_output("mkdir "+path, shell=True)
subprocess.check_output("chmod 777 "+path, shell=True)
def clear(self):
for path in self.temp_host_output_dirs:
if os.path.isdir(path):
subprocess.check_output("rm -r "+path, shell=True)
class CommunicationManager:
def __init__(self, dbuserkey, out_dir, std_out, hdbsql_string, log_features):
self.dbuserkey = dbuserkey
self.out_dir = out_dir
self.std_out = std_out
self.hdbsql_string = hdbsql_string
self.log_features = log_features
class CriticalFeature:
def __init__(self, view, feature, value, limit, killSession = False):
self.view = view
self.feature = feature
self.maxRepeat = None
self.whereMode = (self.feature == 'WHERE')
if self.whereMode:
self.whereClause = value
else:
# IF THERE IS A > THEN TRY TO SPLIT TO A MAX_REPEAT AND A VALUE
if '>' in value: # to find string before > X number times where X is the integer after >
self.maxRepeat = value.rsplit('>',1)[1] #rsplit allows other >s in the value
if is_integer(self.maxRepeat): #if not, then this > was not intended for repeat
value = value.rsplit('>',1)[0] #where-clause to find rows where the column 'feature' contains the string 'value' more than 'maxRepeat' times
self.whereClause = "length("+feature+") - length(replace("+feature+", '"+value+"', '')) > "+str(int(self.maxRepeat)*len(value))
# IF NOT MANAGED TO SPLIT THEN FIRST CORRECT WILDCARDS AND THEN CREATE THE WHERE CLAUSE
if not is_integer(self.maxRepeat):
if value[0] == '*' and value[-1] == '*': #wildcards, "*", before and after
value = "'%"+value[1:-1]+"%'"
elif value[0] == '*': #wildcard, "*", before
value = "'%"+value[1:]+"'"
elif value[-1] == '*': #wildcard, "*", after
value = "'"+value[:-1]+"%'"
else:
value = "'"+value+"'"
                if value[1] == '%' or value[-2] == '%': # wildcard before and/or after; value is quoted by now, so a trailing wildcard sits at value[-2]
self.whereClause = feature + " like " + value #where-clause with wildcard(s)
else:
self.whereClause = feature + " = " + value #where-clause without wildcard(s)
self.value = value
self.limitIsMinimumNumberCFAllowed = (limit[0] == '>') # so default and < then maximum number CF allowed
if limit[0] in ['<', '>']:
limit = limit[1:]
if not is_integer(limit):
print "INPUT ERROR: 4th item of -cf must be either an integer or an integer preceded by < or >. Please see --help for more information."
os._exit(1)
self.limit = int(limit)
self.killSession = killSession
self.whereClauseDescription = self.whereClause
if is_integer(self.maxRepeat):
self.whereClauseDescription = "column "+self.feature+" in "+self.view+" contains the string "+self.value+" more than "+self.maxRepeat+" times"
self.nbrIterations = 1
self.interval = 0 #[s]
def setKillSession(self, killSession):
self.killSession = killSession
def setIterations(self, iterations, interval):
self.nbrIterations = iterations
self.interval = interval
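# Illustration of the where clauses CriticalFeature builds, using hypothetical -cf entries (for this comment only):
#   M_SERVICE_THREADS,THREAD_STATE,Running,30        ->  THREAD_STATE = 'Running'            (at most 30 matching rows allowed)
#   M_SERVICE_THREADS,THREAD_METHOD,PlanExecutor*,5  ->  THREAD_METHOD like 'PlanExecutor%'
#   M_SERVICE_THREADS,THREAD_DETAIL,foo>3,1          ->  length(THREAD_DETAIL) - length(replace(THREAD_DETAIL, 'foo', '')) > 9   (string 'foo' more than 3 times)
#   M_SERVICE_THREADS,WHERE,IS_ACTIVE='TRUE',1       ->  IS_ACTIVE='TRUE'                    (where clause taken as-is)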
######################## DEFINE FUNCTIONS ################################
def is_integer(s):
if s == None:
return False
try:
int(s)
return True
except ValueError:
return False
def is_email(s):
s = s.split('@')
if not len(s) == 2:
return False
return '.' in s[1]
def checkAndConvertBooleanFlag(boolean, flagstring):
boolean = boolean.lower()
if boolean not in ("false", "true"):
print "INPUT ERROR: ", flagstring, " must be either 'true' or 'false'. Please see --help for more information."
os._exit(1)
boolean = True if boolean == "true" else False
return boolean
def is_online(dbinstance, comman):
process = subprocess.Popen(['sapcontrol', '-nr', dbinstance, '-function', 'GetProcessList'], stdout=subprocess.PIPE)
out, err = process.communicate()
number_services = out.count(" HDB ")
number_running_services = out.count("GREEN")
test_ok = (str(err) == "None")
result = number_running_services == number_services
printout = "Online Check , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , - , "+str(test_ok)+" , "+str(result)+" , Number running services: "+str(number_running_services)+" out of "+str(number_services)
log(printout, comman)
return result
def is_secondary(comman):
process = subprocess.Popen(['hdbnsutil', '-sr_state'], stdout=subprocess.PIPE)
out, err = process.communicate()
test_ok = (str(err) == "None")
result = "active primary site" in out # then it is secondary!
printout = "Primary Check , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , - , "+str(test_ok)+" , "+str(not result)+" , "
log(printout, comman)
return result
def ping_db(comman, output):
with open(os.devnull, 'w') as devnull: # just to get no stdout in case HANA is offline
try:
output[0] = subprocess.check_output(comman.hdbsql_string+''' -j -A -U '''+comman.dbuserkey+''' "select * from dummy"''', shell=True, stderr=devnull)
except:
pass
def hana_ping(ping_timeout, comman):
pause = ping_timeout/10.
lifetime = 0
pinged = False
hanging = False
offline = False
while not pinged and not hanging and not offline:
output = [None]
t = Timer(0.1,ping_db,[comman, output]) # Will not return if HANA is in a hanging situation, if HANA is offline it will return immediately with output[0] still Null
t.start()
t.join(ping_timeout)
hanging = t.is_alive()
if output[0]:
pinged = output[0].splitlines(1)[2].replace('|','').replace(' ','').replace('\n','') == 'X'
if hanging and pinged:
print "ERROR, it cannot be both pinged and hanging"
os._exit(1)
if not pinged and not hanging: # then still investigating if offline
offline = lifetime > ping_timeout
if not offline:
time.sleep(pause) # e.g. if ping timeout is 60 seconds it will retry after 6 seconds if HANA is offline
lifetime += pause
return [hanging, offline]
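# Worked example of the timing in hana_ping above, assuming the default ping timeout of 60 seconds: pause = 6 s,
# so if the hdbsql call fails immediately (offline case) the ping is simply retried every 6 s, and the DB is
# declared offline once lifetime exceeds 60 s; if the call never returns (hanging case) t.join() gives up after
# 60 s and hanging is reported instead.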
def is_number(s):
try:
float(s)
return True
except ValueError:
return False
def prio_def(prio_number):
prios = {1:"RTE", 2:"Call Stacks", 3:"G-Stacks", 4:"Kernel Profiler"}
return prios[prio_number]
def recording_prio_convert(recording_prio):
recordings = []
for rec in recording_prio:
recordings.append(prio_def(rec))
return " ".join(recordings)
def file_lines_with_word(file_name, word):
lines = []
with open(file_name) as f:
for line in f:
if word in line:
lines.append(line)
return lines
def clean_logs(minRetainedLogDays, comman):
path = comman.out_dir
nFilesBefore = len([name for name in os.listdir(path) if "hanasitterlog" in name])
subprocess.check_output("find "+path+"/hanasitterlog* -mtime +"+str(minRetainedLogDays)+" -delete", shell=True)
nFilesAfter = len([name for name in os.listdir(path) if "hanasitterlog" in name])
return nFilesBefore - nFilesAfter
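# Example of the log cleanup above, with hypothetical values -od /tmp/hanasitter_output and -or 7: it runs
#   find /tmp/hanasitter_output/hanasitterlog* -mtime +7 -delete
# and returns how many hanasitterlog files were removed.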
def tenant_names_and_ports(daemon_file):
tenantDBNames = []
tenantIndexserverPorts = []
ports_first_halfs = []
ports_second_halfs = []
foundNewName = False
foundFirstPortHalf = False
with open(daemon_file) as f:
for line in f:
if not foundNewName and "[indexserver." in line:
tenantDBNames.append(line.strip("[indexserver.").strip("\n").strip("]"))
foundNewName = True
elif foundNewName and not foundFirstPortHalf and "arguments = -port " in line:
ports_first_halfs.append(line.strip("arguments = -port ").split("$")[0])
foundFirstPortHalf = True
elif foundNewName and foundFirstPortHalf and "instanceids = " in line:
ports_second_halfs.append(line.strip("instanceids = ").strip("\n"))
foundNewName = False
foundFirstPortHalf = False
tenantIndexserverPorts = [first+second for first, second in zip(ports_first_halfs, ports_second_halfs)]
return [tenantDBNames, tenantIndexserverPorts]
def cpu_too_high(cpu_check_params, comman):
if int(cpu_check_params[0]) == 0 or int(cpu_check_params[1]) == 0 or int(cpu_check_params[3]) == 100: # if CPU type is 0 or if number CPU checks is 0 or allowed CPU is 100 then no CPU check
return False
start_time = datetime.now()
command_run = subprocess.check_output("sar -u "+cpu_check_params[1]+" "+cpu_check_params[2], shell=True)
sar_words = command_run.split()
cpu_column = 2 if int(cpu_check_params[0]) == 1 else 4
current_cpu = sar_words[sar_words.index('Average:') + cpu_column]
if not is_number(current_cpu):
print "ERROR, something went wrong while using sar. Output = "
print command_run
os._exit(1)
too_high_cpu = float(current_cpu) > int(cpu_check_params[3])
stop_time = datetime.now()
cpu_string = "User CPU Check " if int(cpu_check_params[0]) == 1 else "System CPU Check"
printout = cpu_string+" , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , "+str(stop_time-start_time)+" , True , "+str(not too_high_cpu)+" , Av. CPU = "+current_cpu+" % (Allowed = "+cpu_check_params[3]+" %) "
log(printout, comman, sendEmail = too_high_cpu)
return too_high_cpu
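# Illustration of the CPU check above, with the hypothetical flag -cpu 2,5,2,95 (system CPU, 5 checks, 2 s interval,
# max 95 %): it runs "sar -u 5 2" and reads the 'Average:' row, assuming sar's usual column layout
# (CPU, %user, %nice, %system, ...), so offset +2 after the 'Average:' token is the %user value and offset +4 is
# %system; too_high_cpu becomes True if that average exceeds 95.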
def stop_session(cf, comman):
heather = subprocess.check_output(comman.hdbsql_string+' -j -A -U '+comman.dbuserkey+' "select top 1 * from SYS.'+cf.view+' where '+cf.whereClause+'"', shell=True).splitlines(1)
heather = heather[0].strip('\n').strip(' ').split('|')
heather = [h.strip(' ') for h in heather if h != '']
if 'CONNECTION_ID' in heather:
connIds = subprocess.check_output(comman.hdbsql_string+' -j -A -a -x -U '+comman.dbuserkey+' "select distinct CONNECTION_ID from SYS.'+cf.view+' where '+cf.whereClause+'"', shell=True).splitlines(1)
connIds = [c.strip('\n').strip('|').strip(' ') for c in connIds]
for connId in connIds:
printout = "Will disconnect session "+connId+" due to the check: "+cf.whereClauseDescription
log(printout, comman)
subprocess.check_output(comman.hdbsql_string+""" -j -A -U """+comman.dbuserkey+""" "ALTER SYSTEM DISCONNECT SESSION '"""+connId+"""'" """, shell=True)
def feature_check(cf, nbrCriticalFeatures, critical_feature_info, comman): # cf = critical_feature, # comman = communication manager
#CHECKS
viewExists = int(subprocess.check_output(comman.hdbsql_string+" -j -A -a -x -Q -U "+comman.dbuserkey+" \"select count(*) from sys.m_monitors where view_name = '"+cf.view+"'\"", shell=True).strip(' '))
if not viewExists:
log("INPUT ERROR, the view given as first entry in the -cf flag, ", cf.view, ", does not exist. Please see --help for more information.", comman)
os._exit(1)
if not cf.whereMode:
columnExists = int(subprocess.check_output(comman.hdbsql_string+" -j -A -a -x -Q -U "+comman.dbuserkey+" \"select count(*) from sys.m_monitor_columns where view_name = '"+cf.view+"' and view_column_name = '"+cf.feature+"'\"", shell=True).strip(' '))
if not columnExists:
log("INPUT ERROR, the view ", cf.view, " does not have the column ", cf.feature, ". Please see --help for more information.", comman)
os._exit(1)
nbrCFSum = 0
for iteration in range(cf.nbrIterations):
# EXECUTE
command_run = subprocess.check_output(comman.hdbsql_string+' -j -A -U '+comman.dbuserkey+' "select count(*) from SYS.'+cf.view+' where '+cf.whereClause+'"', shell=True)
# COLLECT INFO
if comman.log_features:
critical_feature_info[0] = subprocess.check_output(comman.hdbsql_string+' -j -A -U '+comman.dbuserkey+' "select * from SYS.'+cf.view+' where '+cf.whereClause+'"', shell=True)
# COUNT CRITICAL FEATURES
nbrCFs = -1
try:
nbrCFs = int(command_run.split('|')[5].replace(" ", ""))
except:
log("ERROR, a table was not retrieved. command_run = \n", command_run, "\tPOSSIBLE CONNECTION ERROR: Please check that the key is maintained in hdbuserstore", comman)
os._exit(1)
if nbrCFs < 0:
log("ERROR, something went wrong. command_run = \n", command_run, "\tPOSSIBLE CONNECTION ERROR: Please check that the key is maintained in hdbuserstore", comman)
os._exit(1)
nbrCFSum += nbrCFs
# CRITICAL FEATURE CHECK INTERVALL
time.sleep(float(cf.interval))
# GET AVERAGE
nbrCriticalFeatures[0] = int( float(nbrCFSum) / float(cf.nbrIterations) )
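# Worked example of the averaging in feature_check above, with hypothetical flags
# -cf "M_SERVICE_THREADS,THREAD_STATE,Running,30" -if 3,5: the count query runs 3 times with a 5 s pause after each
# run; if the three counts are 10, 50 and 30 the reported value is int((10+50+30)/3) = 30, which the tracker then
# compares against the limit 30.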
def record_gstack(gstacks_interval, comman):
pid = subprocess.check_output("pgrep hdbindexserver", shell=True).strip("\n").strip(" ")
start_time = datetime.now()
filename = (comman.out_dir.replace(".","_")+"/gstack_"+pid+"_"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+".txt")
os.system('gstack '+pid+' > '+filename)
stop_time = datetime.now()
printout = "GStack Record , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , "+str(stop_time-start_time)+" , - , - , "+filename
log(printout, comman)
time.sleep(gstacks_interval)
return printout
def record_kprof(kprofiler, hdbcons, comman): # SAP Note 1804811
out_dir = comman.out_dir.replace(".","_")+"/"
total_printout = ""
for hdbcon_string, host, tmp_dir in zip(hdbcons.hdbcons_strings, hdbcons.hosts, hdbcons.temp_host_output_dirs):
tenantDBString = hdbcons.tenantDBName+"_" if hdbcons.is_tenant else ""
filename_cpu = ("kernel_profiler_cpu_"+host+"_"+hdbcons.SID+"_"+tenantDBString+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+".dot")
filename_wait = ("kernel_profiler_wait_"+host+"_"+hdbcons.SID+"_"+tenantDBString+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+".dot")
filename_kprof_log = ("kernel_profiler_output_"+host+"_"+hdbcons.SID+"_"+tenantDBString+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+".log")
start_time = datetime.now()
os.system(hdbcon_string+'profiler clear" > '+out_dir+filename_kprof_log)
os.system(hdbcon_string+'profiler start -w '+str(kprofiler.kprofs_wait)+'" > '+out_dir+filename_kprof_log)
time.sleep(kprofiler.kprofs_duration)
os.system(hdbcon_string+'profiler stop" > '+out_dir+filename_kprof_log)
os.system(hdbcon_string+'profiler print -o '+tmp_dir+filename_cpu+','+tmp_dir+filename_wait+'" > '+out_dir+filename_kprof_log)
stop_time = datetime.now()
if "[ERROR]" in open(out_dir+filename_kprof_log).read():
printout = "Kernel Profiler , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , "+str(stop_time-start_time)+" , False , None , "+out_dir+filename_kprof_log
else:
os.system("mv "+tmp_dir+filename_cpu+" "+out_dir+filename_cpu)
os.system("mv "+tmp_dir+filename_wait+" "+out_dir+filename_wait)
printout = "Kernel Profiler , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , "+str(stop_time-start_time)+" , True , - , "+out_dir+filename_cpu+" and "+out_dir+filename_wait
log(printout, comman)
total_printout += printout
time.sleep(kprofiler.kprofs_interval)
return total_printout
def record_callstack(callstacks_interval, hdbcons, comman):
total_printout = ""
for hdbcon_string, host in zip(hdbcons.hdbcons_strings, hdbcons.hosts):
tenantDBString = hdbcons.tenantDBName+"_" if hdbcons.is_tenant else ""
filename = (comman.out_dir.replace(".","_")+"/callstack_"+host+"_"+hdbcons.SID+"_"+tenantDBString+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+".txt")
start_time = datetime.now()
os.system(hdbcon_string+'context list -s" > '+filename)
stop_time = datetime.now()
printout = "Call Stack Record , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , "+str(stop_time-start_time)+" , - , - , "+filename
log(printout, comman)
total_printout += printout
time.sleep(callstacks_interval)
return total_printout
def record_rtedump(rtedumps_interval, hdbcons, comman):
total_printout = ""
for hdbcon_string, host in zip(hdbcons.hdbcons_strings, hdbcons.hosts):
tenantDBString = hdbcons.tenantDBName+"_" if hdbcons.is_tenant else ""
start_time = datetime.now()
if hdbcons.rte_mode == 0: # normal rte dump
filename = (comman.out_dir.replace(".","_")+"/rtedump_normal_"+host+"_"+hdbcons.SID+"_"+tenantDBString+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+".trc")
os.system(hdbcon_string+'runtimedump dump -c" > '+filename) # have to dump to std with -c and then to a file with > since in case of scale-out -f does not work
elif hdbcons.rte_mode == 1: # light rte dump
filename = (comman.out_dir.replace(".","_")+"/rtedump_light_"+host+"_"+hdbcons.SID+"_"+tenantDBString+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+".trc")
os.system(hdbcon_string+'runtimedump dump -c -s STACK_SHORT,THREADS" > '+filename)
os.system(hdbcon_string+'statreg print -h -n M_JOBEXECUTORS_" >> '+filename)
os.system(hdbcon_string+'statreg print -h -n M_DEV_JOBEX_THREADGROUPS" >> '+filename)
os.system(hdbcon_string+'statreg print -h -n M_DEV_JOBEXWAITING" >> '+filename)
os.system(hdbcon_string+'statreg print -h -n M_DEV_CONTEXTS" >> '+filename)
os.system(hdbcon_string+'statreg print -h -n M_CONNECTIONS" >> '+filename)
os.system(hdbcon_string+'statreg print -h -n M_DEV_SESSION_PARTITIONS" >> '+filename)
stop_time = datetime.now()
printout = "RTE Dump Record , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , "+str(stop_time-start_time)+" , True , - , "+filename # if an [ERROR] happens that will be inside the file, hanasitter will not know it
log(printout, comman)
total_printout += printout
time.sleep(rtedumps_interval)
return total_printout
def record(recording_mode, rte, callstack, gstack, kprofiler, recording_prio, hdbcons, comman):
if recording_mode == 1:
for p in recording_prio:
if p == 1:
for i in range(rte.num_rtedumps):
record_rtedump(rte.rtedumps_interval, hdbcons, comman)
if p == 2:
for i in range(callstack.num_callstacks):
record_callstack(callstack.callstacks_interval, hdbcons, comman)
if p == 3:
for i in range(gstack.num_gstacks):
record_gstack(gstack.gstacks_interval, comman)
if p == 4:
for i in range(kprofiler.num_kprofs):
record_kprof(kprofiler, hdbcons, comman)
elif recording_mode == 2:
max_nbr_recordings = max(gstack.num_gstacks, kprofiler.num_kprofs, callstack.num_callstacks, rte.num_rtedumps)
for i in range(max_nbr_recordings):
for p in recording_prio:
if p == 1:
if i < rte.num_rtedumps:
record_rtedump(rte.rtedumps_interval, hdbcons, comman)
if p == 2:
if i < callstack.num_callstacks:
record_callstack(callstack.callstacks_interval, hdbcons, comman)
if p == 3:
if i < gstack.num_gstacks:
record_gstack(gstack.gstacks_interval, comman)
if p == 4:
if i < kprofiler.num_kprofs:
record_kprof(kprofiler, hdbcons, comman)
else:
record_in_parallel(rte, callstack, gstack, kprofiler, hdbcons, comman)
return True
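# Illustration of the recording modes above, for a hypothetical request of 2 RTE dumps and 3 call stacks with the
# default priorities 1,2,3,4:
#   mode 1: RTE 1, RTE 2, CallStack 1, CallStack 2, CallStack 3
#   mode 2: RTE 1, CallStack 1, RTE 2, CallStack 2, CallStack 3
#   mode 3: RTE 1 and CallStack 1 in parallel, then RTE 2 and CallStack 2 in parallel, then CallStack 3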
def record_in_parallel(rte, callstack, gstack, kprofiler, hdbcons, comman):
max_nbr_recordings = max(gstack.num_gstacks, kprofiler.num_kprofs, callstack.num_callstacks, rte.num_rtedumps)
for i in range(max_nbr_recordings):
nbr_recording_types = sum(x > i for x in [rte.num_rtedumps, callstack.num_callstacks, gstack.num_gstacks, kprofiler.num_kprofs])
pool = Pool(nbr_recording_types) # need as many threads as number of recording types
rec_types = []
if rte.num_rtedumps > i:
rec_types.append((1, rte, hdbcons, comman)) # 1 = RTE
if callstack.num_callstacks > i:
rec_types.append((2, callstack, hdbcons, comman)) # 2 = CallStacks
if gstack.num_gstacks > i:
rec_types.append((3, gstack, hdbcons, comman)) # 3 = GStacks
if kprofiler.num_kprofs > i:
rec_types.append((4, kprofiler, hdbcons, comman)) # 4 = Kernel Profiler
results = pool.map(parallel_recording_wrapper, rec_types)
if comman.std_out:
for j in range(len(results)):
log(results[j], comman)
pool.close()
pool.join()
def parallel_recording_wrapper(rec_types):
return parallel_recording(*rec_types)
def parallel_recording(record_type, recorder, hdbcons, comman):
if record_type == 1:
return record_rtedump(recorder.rtedumps_interval, hdbcons, CommunicationManager(comman.dbuserkey, comman.out_dir, False, comman.hdbsql_string, comman.log_features))
elif record_type == 2:
return record_callstack(recorder.callstacks_interval, hdbcons, CommunicationManager(comman.dbuserkey, comman.out_dir, False, comman.hdbsql_string, comman.log_features))
elif record_type == 3:
return record_gstack(recorder.gstacks_interval, CommunicationManager(comman.dbuserkey, comman.out_dir, False, comman.hdbsql_string, comman.log_features))
else:
return record_kprof(recorder, hdbcons, CommunicationManager(comman.dbuserkey, comman.out_dir, False, comman.hdbsql_string, comman.log_features))
def tracker(ping_timeout, check_interval, recording_mode, rte, callstack, gstack, kprofiler, recording_prio, critical_features, feature_check_timeout, cpu_check_params, minRetainedLogDays, comman, hdbcons):
recorded = False
offline = False
while not recorded:
# CPU CHECK
if cpu_too_high(cpu_check_params, comman): #first check CPU with 'sar' (i.e. without contacting HANA) if it is too high, record without pinging or feature checking
recorded = record(recording_mode, rte, callstack, gstack, kprofiler, recording_prio, hdbcons, comman)
if not recorded:
# PING CHECK - to find either hanging or offline situations
start_time = datetime.now()
[hanging, offline] = hana_ping(ping_timeout, comman)
stop_time = datetime.now()
if offline:
comment = "DB is offline, will exit the tracker"
elif hanging:
comment = "No response from DB within "+str(ping_timeout)+" seconds"
else:
comment = "DB responded faster than "+str(ping_timeout)+" seconds"
log("Ping Check , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , "+str(stop_time-start_time)+" , - , "+str(not hanging and not offline)+" , "+comment, comman, sendEmail = not hanging and not offline)
if hanging:
recorded = record(recording_mode, rte, callstack, gstack, kprofiler, recording_prio, hdbcons, comman)
if offline:
return [recorded, offline] # exit the tracker if HANA turns offline during tracking
if not recorded:
# FEATURE CHECK - only done if recording has not already been done from either the CPU check or from the Ping check
chid = 0
for cf in critical_features:
if not recorded: #No hang situation or critical feature situation happened yet, so check for a critical feature
                    nbrCriticalFeatures = [-1]
                    critical_feature_info = [""]
                    wrong_number_critical_features = False # defined up front so the printout below works even if the feature check hangs
chid += 1
start_time = datetime.now()
t = Timer(0.1,feature_check,[cf, nbrCriticalFeatures, critical_feature_info, comman])
t.start()
t.join((feature_check_timeout + cf.interval)*cf.nbrIterations)
hanging = t.is_alive()
if hanging:
info_message = "Hang situation during feature-check detected"
else:
wrong_number_critical_features = (cf.limitIsMinimumNumberCFAllowed and nbrCriticalFeatures[0] < cf.limit) or (not cf.limitIsMinimumNumberCFAllowed and nbrCriticalFeatures[0] > cf.limit)
if cf.limitIsMinimumNumberCFAllowed:
info_message = "# Critical Features = "+str(nbrCriticalFeatures[0])+" (minimum required = "+str(cf.limit)+"), Check if "+cf.whereClauseDescription
else:
info_message = "# Critical Features = "+str(nbrCriticalFeatures[0])+" (maximum allowed = "+str(cf.limit)+"), Check if "+cf.whereClauseDescription
stop_time = datetime.now()
printout = "Feature Check "+str(chid)+" , "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+" , "+str(stop_time-start_time)+" , "+str(not hanging)+" , "+str(not wrong_number_critical_features)+" , "+info_message
log(printout, comman, sendEmail = wrong_number_critical_features)
if comman.log_features:
log(critical_feature_info[0], CommunicationManager(comman.dbuserkey, comman.out_dir, False, comman.hdbsql_string, comman.log_features), "criticalFeatures")
if hanging or wrong_number_critical_features:
if cf.killSession:
stop_session(cf, comman)
recorded = record(recording_mode, rte, callstack, gstack, kprofiler, recording_prio, hdbcons, comman)
if not recorded:
time.sleep(check_interval)
if minRetainedLogDays >= 0: # automatic house keeping of hanasitter logs
nCleaned = clean_logs(minRetainedLogDays, comman)
log(str(nCleaned)+" hanasitter daily log files were removed", comman)
return [recorded, offline]
def cdalias(alias, local_dbinstance): # alias e.g. cdtrace, cdhdb, ...
command_run = subprocess.check_output(['/bin/bash', '-i', '-c', "alias "+alias]).split("alias")[1]
pieces = command_run.strip("\n").strip(" "+alias+"=").strip("'").strip("cd ").split("/")
path = ''
for piece in pieces:
if piece and piece[0] == '$':
piece = (subprocess.check_output(['/bin/bash', '-i', '-c', "echo "+piece])).strip("\n")
path = path + '/' + piece + '/'
path = path.replace("[0-9][0-9]", local_dbinstance) # if /bin/bash shows strange HDB[0-9][0-9] we force correct instance on it
return path
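# Example of the alias resolution above (hypothetical SID ABC and instance 00, for this comment only): if
# "alias cdtrace" expands to something like cd /usr/sap/ABC/HDB00/$HOSTNAME/trace, then cdalias('cdtrace', '00')
# would return a path equivalent to /usr/sap/ABC/HDB00/<local host>/trace/ (possibly with duplicated slashes,
# which Linux tolerates), with each $-variable resolved through /bin/bash.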
def log(message, comman, file_name = "", sendEmail = False):
if comman.std_out:
print message
if file_name == "":
file_name = "hanasitterlog"
logfile = open(comman.out_dir+"/"+file_name+"_"+datetime.now().strftime("%Y-%m-%d"+".txt").replace(" ", "_"), "a")
logfile.write(message+"\n")
logfile.flush()
logfile.close()
global emailNotification
if sendEmail and emailNotification: #sends email IF this call of log() wants it AND IF -en flag has been specified
#MAILX (https://www.systutorials.com/5167/sending-email-using-mailx-in-linux-through-internal-smtp/):
mailstring = 'echo "'+message+'" | mailx -s "Message from HANASitter about '+emailNotification.SID+'" -S smtp=smtp://'+emailNotification.mailServer+' -S from="'+emailNotification.senderEmail+'" '+emailNotification.recieverEmail
#print mailstring
output = subprocess.check_output(mailstring, shell=True)
def main():
##################### CHECK PYTHON VERSION ###########
if sys.version_info[0] != 2 or sys.version_info[1] != 7:
print "VERSION ERROR: hanacleaner is only supported for Python 2.7.x. Did you maybe forget to log in as <sid>adm before executing this?"
os._exit(1)
##################### DEFAULTS ####################
online_test_interval = 3600 #seconds
ping_timeout = 60 #seconds
check_interval = 60 #seconds
recording_mode = 1 # either 1, 2 or 3
recording_prio = ['1', '2', '3', '4'] # 1=RTE, 2=CallStacks, 3=GStacks, 4=Kernel Profiler
num_rtedumps = 0 #how many rtedumps?
rtedumps_interval = 60 #seconds
rte_mode = 0 # either 0 or 1
num_callstacks = 0 #how many call stacks?
callstacks_interval = 60 #seconds
num_gstacks = 0 #how many call stacks?
gstacks_interval = 60 #seconds
num_kprofs = 0 #how many kernel profiler traces?
kprofs_interval = 60 #seconds
kprofs_duration = 60 #seconds
kprofs_wait = 0 #milliseconds
feature_check_timeout = 60 #seconds
#critical_features = ['M_SERVICE_THREADS','IS_ACTIVE','TRUE','30'] #one critical feature state with max allowed 30
critical_features = [] # default: don't use critical feature check
kill_session = [] # default: do not kill any session
intervals_of_features = [] #default only one check per feature
after_recorded = -1 #default: exits after recorded
std_out = "true" #print to std out
out_dir = "/tmp/hanasitter_output"
minRetainedLogDays = -1 #automatic cleanup of hanasitterlog
flag_file = "" #default: no configuration input file
log_features = "false"
email_notification = None
ssl = "false"
virtual_local_host = "" #default: assume physical local host
dbuserkey = 'SYSTEMKEY' # This KEY has to be maintained in hdbuserstore
# so that hdbuserstore LIST gives e.g.
# KEY SYSTEMKEY
# ENV : mo-fc8d991e0:30015
# USER: SYSTEM
cpu_check_params = ['0', '0','0','100'] # by default no cpu check
##################### CHECK INPUT ARGUMENTS #################
if len(sys.argv) == 1:
print "INPUT ERROR: hanasitter needs input arguments. Please see --help for more information."
os._exit(1)
if len(sys.argv) != 2 and len(sys.argv) % 2 == 0:
print "INPUT ERROR: Wrong number of input arguments. Please see --help for more information."
os._exit(1)
for i in range(len(sys.argv)):
if i % 2 != 0:
if sys.argv[i][0] != '-':
print "INPUT ERROR: Every second argument has to be a flag, i.e. start with -. Please see --help for more information."
os._exit(1)
##################### PRIMARY INPUT ARGUMENTS ####################
if '-h' in sys.argv or '--help' in sys.argv:
printHelp()
if '-d' in sys.argv or '--disclaimer' in sys.argv:
printDisclaimer()
if '-ff' in sys.argv:
flag_file = sys.argv[sys.argv.index('-ff') + 1]
############ CONFIGURATION FILE ###################
if flag_file:
with open(flag_file, 'r') as fin:
for line in fin:
firstWord = line.strip(' ').split(' ')[0]
if firstWord[0:1] == '-':
flagValue = line.strip(' ').split('"')[1].strip('\n').strip('\r') if line.strip(' ').split(' ')[1][0] == '"' else line.strip(' ').split(' ')[1].strip('\n').strip('\r')
if firstWord == '-oi':
online_test_interval = flagValue
if firstWord == '-pt':
ping_timeout = flagValue
if firstWord == '-ci':
check_interval = flagValue
if firstWord == '-rm':
recording_mode = flagValue
if firstWord == '-rp':
recording_prio = [x for x in flagValue.split(',')]
if firstWord == '-nr':
num_rtedumps = flagValue
if firstWord == '-ir':
rtedumps_interval = flagValue
if firstWord == '-mr':
rte_mode = flagValue
if firstWord == '-ks':
kill_session = [x.strip('"') for x in flagValue.split(',')]
if firstWord == '-nc':
num_callstacks = flagValue
if firstWord == '-ic':
callstacks_interval = flagValue
if firstWord == '-ng':
num_gstacks = flagValue
if firstWord == '-ig':
gstacks_interval = flagValue
if firstWord == '-np':
num_kprofs = flagValue
if firstWord == '-ip':
kprofs_interval = flagValue
if firstWord == '-dp':
kprofs_duration = flagValue
if firstWord == '-wp':
kprofs_wait = flagValue
if firstWord == '-cf':
critical_features = [x.strip('"') for x in flagValue.split(',')]
if firstWord == '-if':
intervals_of_features = [x.strip('"') for x in flagValue.split(',')]
if firstWord == '-tf':
feature_check_timeout = flagValue
if firstWord == '-ar':
after_recorded = flagValue
if firstWord == '-od':
out_dir = flagValue
if firstWord == '-or':
minRetainedLogDays = flagValue
if firstWord == '-lf':
log_features = flagValue
if firstWord == '-en':
email_notification = [x for x in flagValue.split(',')]
if firstWord == '-so':
std_out = flagValue
if firstWord == '-ssl':
ssl = flagValue
if firstWord == '-vlh':
virtual_local_host = flagValue
if firstWord == '-k':
dbuserkey = flagValue
if firstWord == '-cpu':
cpu_check_params = [x for x in flagValue.split(',')]
##################### INPUT ARGUMENTS (these would overwrite whats in the configuration file) ####################
if '-oi' in sys.argv:
online_test_interval = sys.argv[sys.argv.index('-oi') + 1]
if '-pt' in sys.argv:
ping_timeout = sys.argv[sys.argv.index('-pt') + 1]
if '-ci' in sys.argv:
check_interval = sys.argv[sys.argv.index('-ci') + 1]
if '-rm' in sys.argv:
recording_mode = sys.argv[sys.argv.index('-rm') + 1]
if '-rp' in sys.argv:
recording_prio = [x for x in sys.argv[ sys.argv.index('-rp') + 1 ].split(',')]
if '-nr' in sys.argv:
num_rtedumps = sys.argv[sys.argv.index('-nr') + 1]
if '-ir' in sys.argv:
rtedumps_interval = sys.argv[sys.argv.index('-ir') + 1]
if '-mr' in sys.argv:
rte_mode = sys.argv[sys.argv.index('-mr') + 1]
if '-ks' in sys.argv:
kill_session = [x.strip('"') for x in sys.argv[ sys.argv.index('-ks') + 1 ].split(',')]
if '-nc' in sys.argv:
num_callstacks = sys.argv[sys.argv.index('-nc') + 1]
if '-ic' in sys.argv:
callstacks_interval = sys.argv[sys.argv.index('-ic') + 1]
if '-ng' in sys.argv:
num_gstacks = sys.argv[sys.argv.index('-ng') + 1]
if '-ig' in sys.argv:
gstacks_interval = sys.argv[sys.argv.index('-ig') + 1]
if '-np' in sys.argv:
num_kprofs = sys.argv[sys.argv.index('-np') + 1]
if '-ip' in sys.argv:
kprofs_interval = sys.argv[sys.argv.index('-ip') + 1]
if '-dp' in sys.argv:
kprofs_duration = sys.argv[sys.argv.index('-dp') + 1]
if '-wp' in sys.argv:
kprofs_wait = sys.argv[sys.argv.index('-wp') + 1]
if '-cf' in sys.argv:
critical_features = [x.strip('"') for x in sys.argv[ sys.argv.index('-cf') + 1 ].split(',')]
if critical_features == ['']: # allow no critical feature with -cf ""
critical_features = [] # make the length 0 in case of -cf ""
if '-if' in sys.argv:
intervals_of_features = [x.strip('"') for x in sys.argv[ sys.argv.index('-if') + 1 ].split(',')]
if '-tf' in sys.argv:
feature_check_timeout = sys.argv[sys.argv.index('-tf') + 1]
if '-ar' in sys.argv:
after_recorded = sys.argv[sys.argv.index('-ar') + 1]
if '-od' in sys.argv:
out_dir = sys.argv[sys.argv.index('-od') + 1]
if '-or' in sys.argv:
minRetainedLogDays = sys.argv[sys.argv.index('-or') + 1]
if '-lf' in sys.argv:
log_features = sys.argv[sys.argv.index('-lf') + 1]
if '-so' in sys.argv:
        std_out = sys.argv[sys.argv.index('-so') + 1]
if '-en' in sys.argv:
email_notification = [x for x in sys.argv[ sys.argv.index('-en') + 1 ].split(',')]
if '-ssl' in sys.argv:
ssl = sys.argv[sys.argv.index('-ssl') + 1]
if '-vlh' in sys.argv:
virtual_local_host = sys.argv[sys.argv.index('-vlh') + 1]
if '-k' in sys.argv:
dbuserkey = sys.argv[sys.argv.index('-k') + 1]
if '-cpu' in sys.argv:
cpu_check_params = [x for x in sys.argv[ sys.argv.index('-cpu') + 1 ].split(',')]
############ GET LOCAL HOST, LOCAL SQL PORT, LOCAL INSTANCE and SID ##########
local_host = subprocess.check_output("hostname", shell=True).replace('\n','') if virtual_local_host == "" else virtual_local_host
key_environment = subprocess.check_output('''hdbuserstore LIST '''+dbuserkey, shell=True)
if "NOT FOUND" in key_environment:
print "ERROR, the key ", dbuserkey, " is not maintained in hdbuserstore."
os._exit(1)
ENV = key_environment.split('\n')[1].replace(' ENV : ','').split(',')
key_hosts = [env.split(':')[0] for env in ENV]
if not local_host in key_hosts:
print "ERROR, local host, ", local_host, ", should be one of the hosts specified for the key, ", dbuserkey, " (in case of virtual, please use -vlh, see --help for more info)"
os._exit(1)
local_host_index = key_hosts.index(local_host)
key_sqlports = [env.split(':')[1] for env in ENV]
local_sqlport = key_sqlports[local_host_index]
dbinstances = [port[1:3] for port in key_sqlports]
if not all(x == dbinstances[0] for x in dbinstances):
print "ERROR: The hosts provided with the user key, "+dbuserkey+", does not all have the same instance number"
os._exit(1)
local_dbinstance = dbinstances[local_host_index]
SID = subprocess.check_output('whoami', shell=True).replace('\n','').replace('adm','').upper()
### MDC or not, SystemDB or Tenant ###
tenantIndexserverPorts = [] # First assume non-mdc, if it finds tenant ports then it is mdc
output = subprocess.check_output('HDB info', shell=True).splitlines(1)
tenantIndexserverPorts = [line.split(' ')[-1].strip('\n') for line in output if "hdbindexserver -port" in line]
tenantDBNames = [line.split(' ')[0].replace('adm','').upper() for line in output if "hdbindexserver -port" in line] # only works if high-isolated
is_mdc = len(tenantIndexserverPorts) > 0
output = subprocess.check_output('ls -l '+cdalias('cdhdb', local_dbinstance)+local_host+'/lock', shell=True).splitlines(1)
nameserverPort = [line.split('@')[1].replace('.pid','') for line in output if "hdbnameserver" in line][0].strip('\n')
### TENANT NAMES for NON HIGH-ISOLATED MDC ###
if is_mdc:
if tenantDBNames.count(tenantDBNames[0]) == len(tenantDBNames) and tenantDBNames[0] == SID: # if all tenant names are equal and equal to SystemDB's SID, then it is non-high-isolation --> get tenant names using daemon instead
[tenantDBNames, tenantIndexserverPorts] = tenant_names_and_ports(cdalias('cdhdb', local_dbinstance)+local_host+"/daemon.ini") # if non-high isolation the tenantIndexserverPorts from HDB info could be wrong order
####### COMMUNICATION PORT (i.e. nameserver port if SystemDB at MDC, or indexserver port if TenantDB and if non-MDC) ########
communicationPort = "-1"
tenantDBName = None
is_tenant = False
if is_mdc: