Skip to content

Commit 55aed04

Browse files
committed
QA: More tuning
Signed-off-by: TheWitness <[email protected]>
1 parent 0f35612 commit 55aed04

File tree

2 files changed

+104
-81
lines changed

2 files changed

+104
-81
lines changed

cacti/plugins/lsfenh/lib/analytics.php

Lines changed: 98 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -706,11 +706,11 @@ function grid_collect_pend($clusterid) {
706706
}
707707

708708
// Get summary pending reasons
709-
$reasons = shell_exec($config['base_path'] . '/plugins/grid/bin/bjobs -psum -p1 -uall 2>&1');
709+
$reasons = shell_exec($config['base_path'] . '/plugins/lsfenh/bin/bjobs -psum -p1 -uall 2>&1');
710710
grid_process_reasons($clusterid, $reasons, 'cluster', 0);
711711

712712
// Get summary suspended reasons
713-
$suspend = shell_exec($config['base_path'] . '/plugins/grid/bin/bjobs -uall -s 2>&1');
713+
$suspend = shell_exec($config['base_path'] . '/plugins/lsfenh/bin/bjobs -uall -s 2>&1');
714714

715715
$pend = microtime(true);
716716

@@ -775,7 +775,7 @@ function grid_collect_sla_reasons($sla, $clusterid) {
775775

776776
// Get summary pending reasons
777777
if ($remote != 'on') {
778-
$reasons = shell_exec($config['base_path'] . "/plugins/grid/bin/bjobs -psum -p1 -uall -sla $sla 2>&1");
778+
$reasons = shell_exec($config['base_path'] . "/plugins/lsfenh/bin/bjobs -psum -p1 -uall -sla $sla 2>&1");
779779
grid_process_reasons($clusterid, $reasons, 'sla_' . $sla, 0);
780780
}
781781
}
@@ -785,7 +785,7 @@ function grid_update_job_groups($clusterid) {
785785

786786
$job_lines = array();
787787
$slot_lines = array();
788-
$bjgroup_jobs = shell_exec($config['base_path'] . "/plugins/grid/bin/bjgroup -s 2>&1");
788+
$bjgroup_jobs = shell_exec($config['base_path'] . "/plugins/lsfenh/bin/bjgroup -s 2>&1");
789789
$jsql = $ssql = array();
790790

791791
grid_debug('Job Group Length ' . strlen($bjgroup_jobs) . ' Job Group Length in ClusterID:' . $clusterid);
@@ -794,7 +794,7 @@ function grid_update_job_groups($clusterid) {
794794
$job_lines = explode("\n", $bjgroup_jobs);
795795

796796
if (strpos($job_lines[0], 'No job group found') === false && strpos($job_lines[0], 'Failed in an LSF library') === false) {
797-
$bjgroup_slots = shell_exec($config['base_path'] . "/plugins/grid/bin/bjgroup -N 2>&1");
797+
$bjgroup_slots = shell_exec($config['base_path'] . "/plugins/lsfenh/bin/bjgroup -N 2>&1");
798798

799799
if ($bjgroup_slots != '') {
800800
$slot_lines = explode("\n", $bjgroup_slots);
@@ -835,6 +835,7 @@ function grid_update_job_groups($clusterid) {
835835
(clusterid, groupName, numJobs, pendJobs, runJobs, suspJobs, finishJobs, sla, limitUsed, limitTotal, owner, last_updated, present) VALUES ';
836836

837837
$errors = 0;
838+
$logged = false;
838839

839840
if (cacti_sizeof($job_lines)) {
840841
foreach($job_lines as $line) {
@@ -848,10 +849,16 @@ function grid_update_job_groups($clusterid) {
848849

849850
$parts = preg_split('/[\s]+/', $line);
850851

851-
if (cacti_sizeof($parts) != 10) {
852+
if (cacti_sizeof($parts) > 10) {
852853
$nparts = cacti_sizeof($parts);
853-
cacti_log("Job: Elements: $nparts, $line", false, 'LSFENH');
854+
855+
if (!$logged) {
856+
cacti_log("Job: Cluster: $clusterid Elements: $nparts, $line", false, 'LSFENH');
857+
$logged = true;
858+
}
859+
854860
$errors++;
861+
855862
continue;
856863
}
857864

@@ -901,6 +908,8 @@ function grid_update_job_groups($clusterid) {
901908
$sprefix = 'INSERT INTO grid_job_groups
902909
(clusterid, groupName, numSlots, pendSlots, runSlots, suspSlots, rsvSlots, last_updated, present) VALUES ';
903910

911+
$logged = false;
912+
904913
if (cacti_sizeof($slot_lines)) {
905914
foreach($slot_lines as $line) {
906915
$line = trim($line);
@@ -913,10 +922,16 @@ function grid_update_job_groups($clusterid) {
913922

914923
$parts = preg_split('/[\s]+/', $line);
915924

916-
if (cacti_sizeof($parts) != 9) {
925+
if (cacti_sizeof($parts) > 10) {
917926
$nparts = cacti_sizeof($parts);
918-
cacti_log("Slot: Elements: $nparts, $line", false, 'LSFENH');
927+
928+
if (!$logged) {
929+
cacti_log("Slot: Cluster: $clusterid Elements: $nparts, $line", false, 'LSFENH');
930+
$logged = true;
931+
}
932+
919933
$errors++;
934+
920935
continue;
921936
}
922937

@@ -963,11 +978,11 @@ function grid_update_job_groups($clusterid) {
963978
function grid_hostgroup_sla_definitions($clusterid) {
964979
global $config;
965980

966-
$bmgroup = shell_exec($config['base_path'] . "/plugins/grid/bin/bmgroup -w 2>&1");
981+
$bmgroup = shell_exec($config['base_path'] . "/plugins/lsfenh/bin/bmgroup -w 2>&1");
967982

968983
grid_process_bmgroup($clusterid, $bmgroup);
969984

970-
$bresources = shell_exec($config['base_path'] . "/plugins/grid/bin/bresources -g 2>&1");
985+
$bresources = shell_exec($config['base_path'] . "/plugins/lsfenh/bin/bresources -g 2>&1");
971986

972987
if ($bresources != '') {
973988
$lines = explode("\n", $bresources);
@@ -991,7 +1006,7 @@ function grid_hostgroup_sla_definitions($clusterid) {
9911006
function grid_process_bresources($clusterid, $group) {
9921007
global $config;
9931008

994-
$bresources = shell_exec($config['base_path'] . "/plugins/grid/bin/bresources -g -l $group 2>&1 | grep 'HOSTS:'");
1009+
$bresources = shell_exec($config['base_path'] . "/plugins/lsfenh/bin/bresources -g -l $group 2>&1 | grep 'HOSTS:'");
9951010

9961011
if ($bresources != '') {
9971012
if (!db_column_exists('grid_guarantee_pool', 'hosts')) {
@@ -1334,7 +1349,7 @@ function grid_collect_shared_descriptions($clusterid) {
13341349

13351350
// Get shared resource descriptions via lsinfo
13361351
if ($remote != 'on') {
1337-
$lsinfo = shell_exec($config['base_path'] . "/plugins/grid/bin/lsinfo -w 2>&1");
1352+
$lsinfo = shell_exec($config['base_path'] . "/plugins/lsfenh/bin/lsinfo -w 2>&1");
13381353
grid_process_lsinfo($clusterid, $lsinfo);
13391354
}
13401355
}
@@ -1390,8 +1405,12 @@ function grid_process_lsinfo($clusterid, &$lsinfo) {
13901405
}
13911406

13921407
function grid_translate_reason($reason) {
1408+
// Record the original reason
1409+
$oreason = $reason;
1410+
13931411
// Clean up nasty long pending reasons and case issues
13941412
$reason = str_replace('Resource limit defined on', '', $reason);
1413+
$reason = str_replace('Jobs requirements for resource reservation not satisfied', 'Job Reservation not satisfied', $reason);
13951414
$reason = str_replace('(Resource:', '(Res:', $reason);
13961415
$reason = str_replace('Limit Name:', 'Name:', $reason);
13971416
$reason = str_replace('Limit Value:', 'Value:', $reason);
@@ -1404,7 +1423,35 @@ function grid_translate_reason($reason) {
14041423
$reason = str_replace("'", '', $reason);
14051424
$reason = str_replace('has been reached', 'Limit Reached', $reason);
14061425
$reason = str_replace('has reached', 'Reached', $reason);
1407-
$reason = str_replace('requirements for resource reservation not satisfied', 'Resource Reservation not satisfied', $reason);
1426+
1427+
// Host Reasons are first
1428+
if (str_contains($reason, ' (Host:')) {
1429+
$reason = db_fetch_cell("SELECT TRIM(SUBSTRING_INDEX(reason,' (Host:',1)) AS reason
1430+
FROM (SELECT " . db_qstr($reason) . " AS reason HAVING reason LIKE '% (Host:%') AS rs");
1431+
}
1432+
1433+
// Job level Reasons
1434+
if (str_contains($reason, 'job <')) {
1435+
$reason = db_fetch_cell("SELECT TRIM(SUBSTRING_INDEX(reason,'<', 1)) AS reason,
1436+
FROM (SELECT " . db_qstr($reason) . " AS reason HAVING reason LIKE '%job <%') AS rs");
1437+
}
1438+
1439+
1440+
// Limit Value
1441+
if (str_contains($reason, ' (Limit Value:')) {
1442+
$reason = db_fetch_cell("SELECT TRIM(SUBSTRING_INDEX(reason,' (Limit Value:', 1)) AS reason,
1443+
FROM (SELECT " . db_qstr($reason) . " AS reason HAVING reason LIKE '% (Limit Value:%') AS rs");
1444+
}
1445+
1446+
// Remaining pending reasons
1447+
if (str_contains($reason, ' (Limit Value:')) {
1448+
$reason = db_fetch_cell("SELECT TRIM(REPLACE(REPLACE(reason, \"'\", \"\"), 'Limit: ', '')) AS reason,
1449+
FROM (SELECT " . db_qstr($reason) . " AS reason
1450+
HAVING reason NOT LIKE '% (Limit Value:%'
1451+
AND reason NOT LIKE '% (Host:%'
1452+
AND reason NOT LIKE '%job <%') AS rs");
1453+
}
1454+
14081455
$reason = str_replace(
14091456
array(
14101457
'job', 'dependency', 'condition', 'user', 'host', 'group', 'guarantee',
@@ -1416,6 +1463,9 @@ function grid_translate_reason($reason) {
14161463
), $reason
14171464
);
14181465

1466+
// Debugging
1467+
//cacti_log("O:'$oreason' T:'$reason'");
1468+
14191469
return trim($reason);
14201470
}
14211471

@@ -1518,73 +1568,41 @@ function grid_aggregate_reasons() {
15181568
}
15191569
}
15201570

1521-
$sql_where = ' AND clusterid NOT IN (' . implode(',', $exclude_clusters) . ')';
1571+
$sql_where = ' WHERE clusterid NOT IN (' . implode(',', $exclude_clusters) . ')';
15221572
}
15231573

1524-
// Host Reasons are first
1525-
db_execute("INSERT INTO grid_jobs_reason_summary
1526-
(clusterid, issusp, level, type, reason, jobs_occurrences, present, last_updated)
1527-
SELECT clusterid, issusp, level, type,
1528-
TRIM(SUBSTRING_INDEX(reason,' (Host:',1)) AS reason,
1529-
SUM(jobs_occurrences) AS jobs_occurrences,
1530-
'1' AS present, MAX(last_updated) AS last_updated
1531-
FROM grid_jobs_reason_details
1532-
WHERE reason LIKE '% (Host:%'
1533-
$sql_where
1534-
GROUP BY clusterid, issusp, level, type, TRIM(SUBSTRING_INDEX(reason, ' (Host:',1))
1535-
ON DUPLICATE KEY UPDATE
1536-
jobs_occurrences = VALUES(jobs_occurrences),
1537-
last_updated = VALUES(last_updated),
1538-
present = 1");
1574+
$reasons = db_fetch_assoc("SELECT * FROM grid_jobs_reason_details $sql_where");
15391575

1540-
// Job level Reasons
1541-
db_execute("INSERT INTO grid_jobs_reason_summary
1542-
(clusterid, issusp, level, type, reason, jobs_occurrences, present, last_updated)
1543-
SELECT clusterid, issusp, level, type,
1544-
TRIM(SUBSTRING_INDEX(reason,'<', 1)) AS reason,
1545-
SUM(jobs_occurrences) AS jobs_occurrences,
1546-
'1' AS present, MAX(last_updated) AS last_updated
1547-
FROM grid_jobs_reason_details
1548-
WHERE reason LIKE '%job <%'
1549-
$sql_where
1550-
GROUP BY clusterid, issusp, level, type, TRIM(SUBSTRING_INDEX(reason, 'job <',1))
1551-
ON DUPLICATE KEY UPDATE
1552-
jobs_occurrences = VALUES(jobs_occurrences),
1553-
last_updated = VALUES(last_updated),
1554-
present = 1");
1576+
$num_reasons = cacti_sizeof($reasons);
15551577

1556-
// Limit Value
1557-
db_execute("INSERT INTO grid_jobs_reason_summary
1558-
(clusterid, issusp, level, type, reason, jobs_occurrences, present, last_updated)
1559-
SELECT clusterid, issusp, level, type,
1560-
TRIM(SUBSTRING_INDEX(reason,' (Limit Value:', 1)) AS reason,
1561-
SUM(jobs_occurrences) AS jobs_occurrences,
1562-
'1' AS present, MAX(last_updated) AS last_updated
1563-
FROM grid_jobs_reason_details
1564-
WHERE reason LIKE '% (Limit Value:%'
1565-
$sql_where
1566-
GROUP BY clusterid, issusp, level, type, TRIM(SUBSTRING_INDEX(reason, ' (Limit Value:',1))
1567-
ON DUPLICATE KEY UPDATE
1568-
jobs_occurrences = VALUES(jobs_occurrences),
1569-
last_updated = VALUES(last_updated),
1570-
present = 1");
1578+
/* translate the reasons into something readable */
1579+
if (cacti_sizeof($reasons)) {
1580+
foreach($reasons as $index => $r) {
1581+
$reasons[$index]['reason'] = grid_translate_reason($r['reason']);
1582+
}
1583+
}
15711584

1572-
// Remaining pending reasons
1573-
db_execute("INSERT INTO grid_jobs_reason_summary
1574-
(clusterid, issusp, level, type, reason, jobs_occurrences, present, last_updated)
1575-
SELECT clusterid, issusp, level, type, TRIM(REPLACE(REPLACE(reason, \"'\", \"\"), 'Limit: ', '')) AS reason,
1576-
jobs_occurrences, present, last_updated
1577-
FROM grid_jobs_reason_details
1578-
WHERE reason NOT LIKE '% (Limit Value:%'
1579-
AND reason NOT LIKE '% (Host:%'
1580-
AND reason NOT LIKE '%job <%'
1581-
$sql_where
1582-
ON DUPLICATE KEY UPDATE
1583-
jobs_occurrences = VALUES(jobs_occurrences),
1584-
last_updated = VALUES(last_updated),
1585-
present = 1");
1585+
$format = array(
1586+
'clusterid',
1587+
'issusp',
1588+
'level',
1589+
'type',
1590+
'reason',
1591+
'jobs_occurrences',
1592+
'present',
1593+
'last_updated'
1594+
);
1595+
1596+
$duplicate = " ON DUPLICATE KEY UPDATE
1597+
jobs_occurrences = VALUES(jobs_occurrences),
1598+
last_updated = VALUES(last_updated),
1599+
present = 1";
1600+
1601+
grid_pump_records($reasons, 'grid_jobs_reason_summary', $format, false, $duplicate);
15861602

15871603
db_execute('UPDATE grid_jobs_reason_summary SET jobs_occurrences = 0 WHERE present = 0');
1604+
1605+
return $num_reasons;
15881606
}
15891607

15901608
function grid_collect_jobs_remote($clusterid) {
@@ -1764,7 +1782,7 @@ function grid_collect_jobs($clusterid) {
17641782

17651783
cacti_log('NOTE: Bjobs command starting for ClusterID:' . $clusterid, false, 'LSFENH', POLLER_VERBOSITY_MEDIUM);
17661784

1767-
$jobs = shell_exec($config['base_path'] . "/plugins/grid/bin/bjobs -uall -o 'jobid jobindex user project app stat queue sla:60 first_host slots min_req_proc max_req_proc cpu_used max_mem mem run_time submit_time start_time finish_time effective_resreq combined_resreq' -json 2>/dev/null");
1785+
$jobs = shell_exec($config['base_path'] . "/plugins/lsfenh/bin/bjobs -uall -o 'jobid jobindex user project app stat queue sla:60 first_host slots min_req_proc max_req_proc cpu_used max_mem mem run_time submit_time start_time finish_time effective_resreq combined_resreq' -json 2>/dev/null");
17681786

17691787
if ($jobs != '') {
17701788
$jobs = json_decode($jobs, true);
@@ -2012,7 +2030,7 @@ function grid_collect_jobs($clusterid) {
20122030
function grid_planner_jobs($clusterid) {
20132031
global $config;
20142032

2015-
$jobs = file(shell_exec($config['base_path'] . "/plugins/grid/bin/bjobs -uall -UF -plan 2>/dev/null"));
2033+
$jobs = file(shell_exec($config['base_path'] . "/plugins/lsfenh/bin/bjobs -uall -UF -plan 2>/dev/null"));
20162034
}
20172035

20182036
function grid_update_djob_stats($clusterid, $time) {
@@ -2026,6 +2044,7 @@ function grid_update_djob_stats($clusterid, $time) {
20262044
$start = time();
20272045

20282046
$format = array('clusterid', 'host', 'mem_reserved', 'mem_used', 'runJobs', 'maxJobs', 'maxMemory');
2047+
20292048
$duplicate = 'ON DUPLICATE KEY UPDATE
20302049
mem_reserved = IF(VALUES(maxMemory) < VALUES(mem_reserved) AND VALUES(maxMemory) > 0, VALUES(maxMemory), VALUES(mem_reserved)),
20312050
mem_used = VALUES(mem_used),
@@ -2082,6 +2101,7 @@ function grid_update_djob_stats($clusterid, $time) {
20822101

20832102
// Stats by SLA
20842103
$format = array('clusterid', 'sla', 'dmem_reserved', 'dmem_used', 'djob_efficiency', 'djob_cputime', 'djob_walltime', 'present');
2104+
20852105
$duplicate = 'ON DUPLICATE KEY UPDATE
20862106
dmem_reserved = VALUES(dmem_reserved),
20872107
dmem_used = VALUES(dmem_used),
@@ -2110,6 +2130,7 @@ function grid_update_djob_stats($clusterid, $time) {
21102130

21112131
// SLA Throughput
21122132
$format = array('clusterid', 'sla', 'hourly_started_jobs', 'hourly_done_jobs', 'hourly_exit_jobs', 'present');
2133+
21132134
$duplicate = 'ON DUPLICATE KEY UPDATE
21142135
hourly_done_jobs = VALUES(hourly_done_jobs),
21152136
hourly_exit_jobs = VALUES(hourly_exit_jobs),
@@ -2133,6 +2154,7 @@ function grid_update_djob_stats($clusterid, $time) {
21332154

21342155
// Host Loaning Phase 1
21352156
$format = array('clusterid', 'sla', 'hostl_hosts_total', 'hostl_hosts_busy', 'hostl_slots_total', 'hostl_slots_used', 'hostl_slots_busy', 'present');
2157+
21362158
$duplicate = 'ON DUPLICATE KEY UPDATE
21372159
hostl_hosts_total = VALUES(hostl_hosts_total),
21382160
hostl_hosts_busy = VALUES(hostl_hosts_busy),

cacti/plugins/lsfenh/poller_lsfenh.php

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,11 @@
3838
ini_set('memory_limit', '-1');
3939
ini_set('max_execution_time', '0');
4040

41-
global $debug, $start, $force, $rrdtool;
41+
global $debug, $start, $force;
4242

4343
$debug = false;
4444
$force = false;
4545

46-
$rrdtool = read_config_option('path_rrdtool');
47-
4846
/* we need long group concats */
4947
db_execute('SET SESSION group_concat_max_len = 1000000');
5048

@@ -92,6 +90,9 @@
9290

9391
$poller_interval = read_config_option('poller_interval');
9492

93+
/* create required tables */
94+
create_required_tables();
95+
9596
if ($force || grid_detect_and_correct_running_processes(0, 'LSFENHJOBS', '300')) {
9697
// Run background jobs collectors
9798
run_background_jobs_collectors();
@@ -114,7 +115,7 @@
114115
make_ls_features_key_features();
115116

116117
// Aggregate Reasons for Graphing
117-
grid_aggregate_reasons();
118+
$num_reasons = grid_aggregate_reasons();
118119

119120
// Update cpu counts by hostModel
120121
update_hosts_model_stats();
@@ -127,7 +128,7 @@
127128

128129
$end = microtime(true);
129130

130-
cacti_log(sprintf('LSFENH Core STATS: Time:%4.2f', $end - $start), false, 'SYSTEM');
131+
cacti_log(sprintf('LSFENH Core STATS: Time:%4.2f Reasons:%d', $end - $start, $num_reasons), false, 'SYSTEM');
131132

132133
exit(0);
133134
} else {

0 commit comments

Comments
 (0)