commit 1e216d84c6b95b4f9cb7ee6b5adf9591f8af37d5 Author: Nathan Scott Date: Wed Dec 7 11:40:43 2022 +1100 pmdaopenmetrics: validate given names before using them for metrics Its possible pmdaopenmetrics is fed garbage accidentally, e.g. in the case where a /metrics end point is not made visible and an HTTP error response is returned (misconfiguration). Failure to safeguard this results in the generation of diagnostics in the openmetrics PMDA log file at an alarming rate until all available filesystem space is completely consumed. It can also result in bogus metric names being exposed to PMAPI clients. This commit both adds new safeguards and scales back diagnostic noise from the agent. In some places it switches to more appropriate APIs for the level of logging being performed (esp. debug messages). Resolves Red Hat BZ #2139325 diff -Naurp pcp-5.3.7.orig/qa/1191.out pcp-5.3.7/qa/1191.out --- pcp-5.3.7.orig/qa/1191.out 2021-11-01 13:02:26.000000000 +1100 +++ pcp-5.3.7/qa/1191.out 2023-03-09 11:49:42.372413973 +1100 @@ -10884,13 +10884,6 @@ GC Wall Time for b5be5b9f_b0f1_47de_b436 inst [0 or "0 collector_name:Copy"] value 24462465 inst [1 or "1 collector_name:MSC"] value 49519 -openmetrics.vmware_exporter.[root@ci-vm-10-0-138-196 [vmware_host_net_usage_average] - Data Type: double InDom: PM_INDOM_NULL 0xffffffff - Semantics: instant Units: none -Help: -vmware_host_net_usage_average -Error: Try again. Information not currently available - openmetrics.vmware_exporter.vmware_datastore_accessible [VMWare datastore accessible (true / false)] Data Type: double InDom: 144.35859 0x24008c13 Semantics: instant Units: none diff -Naurp pcp-5.3.7.orig/qa/1221.out pcp-5.3.7/qa/1221.out --- pcp-5.3.7.orig/qa/1221.out 2021-11-01 13:02:26.000000000 +1100 +++ pcp-5.3.7/qa/1221.out 2023-03-09 11:49:42.382414004 +1100 @@ -7105,9 +7105,6 @@ openmetrics.thermostat.tms_jvm_gc_b5be5b inst [0 or "0 collector_name:Copy"] labels {"agent":"openmetrics","collector_name":"Copy","domainname":DOMAINNAME,"groupid":NUM,"hostname":HOSTNAME,"instname":"0 collector_name:Copy","machineid":MACHINEID,"source":"thermostat","url":FILEURL,"userid":NUM} inst [1 or "1 collector_name:MSC"] labels {"agent":"openmetrics","collector_name":"MSC","domainname":DOMAINNAME,"groupid":NUM,"hostname":HOSTNAME,"instname":"1 collector_name:MSC","machineid":MACHINEID,"source":"thermostat","url":FILEURL,"userid":NUM} -openmetrics.vmware_exporter.[root@ci-vm-10-0-138-196 - labels {"agent":"openmetrics","domainname":DOMAINNAME,"groupid":NUM,"hostname":HOSTNAME,"machineid":MACHINEID,"source":"vmware_exporter","url":FILEURL,"userid":NUM} - openmetrics.vmware_exporter.vmware_datastore_accessible labels {"agent":"openmetrics","dc_name":"ha-datacenter","domainname":DOMAINNAME,"ds_cluster":"","ds_name":"name008","groupid":NUM,"hostname":HOSTNAME,"machineid":MACHINEID,"source":"vmware_exporter","url":FILEURL,"userid":NUM} inst [0 or "0 dc_name:ha-datacenter ds_cluster: ds_name:name026"] labels {"agent":"openmetrics","dc_name":"ha-datacenter","domainname":DOMAINNAME,"ds_cluster":"","ds_name":"name026","groupid":NUM,"hostname":HOSTNAME,"instname":"0 dc_name:ha-datacenter ds_cluster: ds_name:name026","machineid":MACHINEID,"source":"vmware_exporter","url":FILEURL,"userid":NUM} @@ -15331,9 +15328,6 @@ openmetrics.thermostat.tms_jvm_gc_b5be5b inst [0 or "0 collector_name:Copy"] labels {"agent":"openmetrics","collector_name":"Copy","domainname":DOMAINNAME,"groupid":NUM,"hostname":HOSTNAME,"instname":"0 collector_name:Copy","machineid":MACHINEID,"source":"thermostat","url":FILEURL,"userid":NUM} inst [1 or "1 collector_name:MSC"] labels {"agent":"openmetrics","collector_name":"MSC","domainname":DOMAINNAME,"groupid":NUM,"hostname":HOSTNAME,"instname":"1 collector_name:MSC","machineid":MACHINEID,"source":"thermostat","url":FILEURL,"userid":NUM} -openmetrics.vmware_exporter.[root@ci-vm-10-0-138-196 - labels {"agent":"openmetrics","domainname":DOMAINNAME,"groupid":NUM,"hostname":HOSTNAME,"machineid":MACHINEID,"source":"vmware_exporter","url":FILEURL,"userid":NUM} - openmetrics.vmware_exporter.vmware_datastore_accessible labels {"agent":"openmetrics","dc_name":"ha-datacenter","domainname":DOMAINNAME,"ds_cluster":"","ds_name":"name008","groupid":NUM,"hostname":HOSTNAME,"machineid":MACHINEID,"source":"vmware_exporter","url":FILEURL,"userid":NUM} inst [0 or "0 dc_name:ha-datacenter ds_cluster: ds_name:name026"] labels {"agent":"openmetrics","dc_name":"ha-datacenter","domainname":DOMAINNAME,"ds_cluster":"","ds_name":"name026","groupid":NUM,"hostname":HOSTNAME,"instname":"0 dc_name:ha-datacenter ds_cluster: ds_name:name026","machineid":MACHINEID,"source":"vmware_exporter","url":FILEURL,"userid":NUM} diff -Naurp pcp-5.3.7.orig/qa/1342 pcp-5.3.7/qa/1342 --- pcp-5.3.7.orig/qa/1342 2021-02-17 15:27:41.000000000 +1100 +++ pcp-5.3.7/qa/1342 2023-03-09 11:49:42.382414004 +1100 @@ -99,6 +99,8 @@ echo == Note: check $seq.full for log en echo == pmdaopenmetrics LOG == >>$here/$seq.full cat $PCP_LOG_DIR/pmcd/openmetrics.log >>$here/$seq.full +# cleanup preparing for remove (else error messages here) +$sudo rm $PCP_PMDAS_DIR/openmetrics/config.d/simple_metric.* _pmdaopenmetrics_remove # success, all done diff -Naurp pcp-5.3.7.orig/qa/1727 pcp-5.3.7/qa/1727 --- pcp-5.3.7.orig/qa/1727 2021-09-23 14:19:12.000000000 +1000 +++ pcp-5.3.7/qa/1727 2023-03-09 11:49:42.382414004 +1100 @@ -2,6 +2,7 @@ # PCP QA Test No. 1727 # Test duplicate instname labels in /metrics webapi when a context # level label such as "hostname" is explicitly specified. +# Test invalid OpenMetrics metric names also. # # Copyright (c) 2021 Red Hat. All Rights Reserved. # @@ -54,8 +55,17 @@ echo '# TYPE somemetric gauge' echo 'somemetric{hostname="$MYHOST"} 1234' EOF -chmod 755 $tmp.script +cat <$tmp.badxml +#! /bin/bash + +cat $here/sadist/891688-dash-time.xml +echo '# TYPE somemetric gauge' +echo 'somemetric{hostname="$MYHOST"} 1234' +EOF + +chmod 755 $tmp.script $tmp.badxml $sudo mv $tmp.script $PCP_PMDAS_DIR/openmetrics/config.d/duplicate_instname_label.sh +$sudo mv $tmp.badxml $PCP_PMDAS_DIR/openmetrics/config.d/invalid_metrics_badinput.sh _pmdaopenmetrics_install if ! _pmdaopenmetrics_wait_for_metric openmetrics.control.calls @@ -68,6 +78,14 @@ echo; echo === /metrics webapi listing. curl -Gs 'http://localhost:44322/metrics?names=openmetrics.duplicate_instname_label.somemetric' \ | _filter_openmetrics_labels +echo; echo === verify metric name validity using pminfo +pminfo -v openmetrics + +# squash errors for a clean uninstall +$sudo rm $PCP_PMDAS_DIR/openmetrics/config.d/invalid_metrics_badinput.sh +# capture openmetrics log for posterity +cat $PCP_LOG_DIR/pmcd/openmetrics.log >> $here/$seq.full + _pmdaopenmetrics_remove # success, all done diff -Naurp pcp-5.3.7.orig/qa/1727.out pcp-5.3.7/qa/1727.out --- pcp-5.3.7.orig/qa/1727.out 2021-09-13 14:40:08.000000000 +1000 +++ pcp-5.3.7/qa/1727.out 2023-03-09 11:49:42.382414004 +1100 @@ -8,6 +8,8 @@ QA output created by 1727 # TYPE openmetrics_duplicate_instname_label_somemetric gauge openmetrics_duplicate_instname_label_somemetric{hostname=HOSTNAME,instid="0",instname="0 hostname:HOSTNAME",domainname=DOMAINNAME,machineid=MACHINEID,source="duplicate_instname_label"} 1234 +=== verify metric name validity using pminfo + === remove openmetrics agent === Culling the Performance Metrics Name Space ... openmetrics ... done diff -Naurp pcp-5.3.7.orig/qa/openmetrics/samples/vmware_exporter.txt pcp-5.3.7/qa/openmetrics/samples/vmware_exporter.txt --- pcp-5.3.7.orig/qa/openmetrics/samples/vmware_exporter.txt 2021-11-01 13:02:26.000000000 +1100 +++ pcp-5.3.7/qa/openmetrics/samples/vmware_exporter.txt 2023-03-09 11:49:42.382414004 +1100 @@ -1035,25 +1035,6 @@ vmware_host_net_errorsTx_summation{clust # HELP vmware_host_net_usage_average vmware_host_net_usage_average # TYPE vmware_host_net_usage_average gauge vmware_host_net_usage_average{cluster_name="",dc_name="ha-datacenter",host_name="SOMEHOSTNAME"} 3.0 -[root@ci-vm-10-0-138-196 ~]# curl -Gs http://localhost:9272/metrics > curl.txt -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# -[root@ci-vm-10-0-138-196 ~]# cat curl.txt # HELP vmware_vm_power_state VMWare VM Power state (On / Off) # TYPE vmware_vm_power_state gauge vmware_vm_power_state{cluster_name="",dc_name="ha-datacenter",ds_name="name026",host_name="SOMEHOSTNAME",vm_name="name023-sat61-client"} 0.0 diff -Naurp pcp-5.3.7.orig/src/pmdas/openmetrics/pmdaopenmetrics.python pcp-5.3.7/src/pmdas/openmetrics/pmdaopenmetrics.python --- pcp-5.3.7.orig/src/pmdas/openmetrics/pmdaopenmetrics.python 2022-04-05 09:05:43.000000000 +1000 +++ pcp-5.3.7/src/pmdas/openmetrics/pmdaopenmetrics.python 2023-03-09 11:49:42.382414004 +1100 @@ -122,7 +122,7 @@ class Metric(object): # c_units = c_api.pmUnits_int(self.munits.dimSpace, self.munits.dimTime, self.munits.dimCount, # self.munits.scaleSpace, self.munits.scaleTime, self.munits.scaleCount) - self.source.pmda.log("Metric: adding new metric %s pmid=%s type=%d sem=%d singular=%s mindom=0x%x labels=%s" % + self.source.pmda.debug("Metric: adding metric %s pmid=%s type=%d sem=%d singular=%s mindom=0x%x labels=%s" % (name, pmContext.pmIDStr(self.pmid), self.mtype, self.msem, self.singular, self.mindom, self.labels)) self.obj = pmdaMetric(self.pmid, self.mtype, self.mindom, self.msem, self.munits) @@ -565,6 +565,7 @@ class Source(object): self.cluster = cluster # unique/persistent id# for nickname self.path = path # pathname to .url or executable file self.url = None + self.parse_error = False self.parse_url_time = 0 # timestamp of config file when it was last parsed self.is_scripted = is_scripted self.pmda = thispmda # the shared pmda @@ -659,6 +660,19 @@ class Source(object): self.pmda.log("instname_labels returning %s" % naming_labels) if self.pmda.dbg else None return naming_labels + def valid_metric_name(self, name): + ''' + Check validity of given metric name component: + - only contains alphanumerics and/or underscores + (colon removed later, allowed through here); + - only starts with an alphabetic character. + ''' + if not name[0].isalpha(): + return False + if not re.sub('[_.:]', '', name).isalnum(): + return False + return True + def parse_metric_line(self, line, pcpline, helpline, typeline): ''' Parse the sample line, identify/create corresponding metric & instance. @@ -687,8 +701,10 @@ class Source(object): # Nb: filter pattern is applied only to the leaf component of the full metric name if self.check_filter(sp.name, "METRIC") != "INCLUDE": self.pmda.log("Metric %s excluded by config filters" % fullname) - return + return True else: + if not self.valid_metric_name(sp.name): + raise ValueError('invalid metric name: ' + sp.name) # new metric metricnum = self.pmids_table.intern_lookup_value(sp.name) # check if the config specifies metadata for this metric @@ -708,24 +724,32 @@ class Source(object): else: m.store_inst(naming_labels, sp.value) self.pmda.set_notify_change() + except ValueError as e: + if not self.parse_error: + self.pmda.err("cannot parse name in %s: %s" % (line, e)) + self.parse_error = True # one-time-only error diagnostic + return False except Exception as e: + if self.pmda.dbg: + traceback.print_exc() # traceback can be handy here self.pmda.err("cannot parse/store %s: %s" % (line, e)) - traceback.print_exc() # traceback can be handy here - + return False + return True def parse_lines(self, text): - '''Refresh all the metric metadata as it is found, including creating - new metrics. Store away metric values for subsequent - fetch()es. Parse errors may result in exceptions. - That's OK, we don't try heroics to parse non-compliant - data. Return number of metrics extracted. ''' - + Refresh all the metric metadata as it is found, including creating + new metrics. Store away metric values for subsequent fetch()es. + Input parse errors result in exceptions and early termination. + That's OK, we don't try heroics to parse non-compliant data. + Return number of metrics extracted. + ''' num_metrics = 0 lines = text.splitlines() pcpline = None helpline = None typeline = None + badness = False state = "metadata" for line in lines: self.pmda.debug("line: %s state: %s" % (line, state)) if self.pmda.dbg else None @@ -762,9 +786,15 @@ class Source(object): # NB: could verify helpline/typeline lp[2] matches, # but we don't have to go out of our way to support # non-compliant exporters. - self.parse_metric_line(l, pcpline, helpline, typeline) + if not self.parse_metric_line(l, pcpline, helpline, typeline): + badness = True + break # bad metric line, skip the remainder of this file num_metrics += 1 + # clear one-time-only error diagnostic if the situation is now resolved + if not badness: + self.parse_error = False + # NB: this logic only ever -adds- Metrics to a Source. If a source # stops supplying some metrics, then a PCP app will see a PM_ERR_INST # coming back when it tries to fetch them. We could perhaps keep the @@ -842,7 +872,7 @@ class Source(object): metadata = line.split(' ')[1:] self.metadatalist.append(metadata) else: - self.pmda.log('Warning: %s ignored unrecognised config entry "%s"' % (self.url, line)) + self.pmda.err('%s ignored unrecognised config entry "%s"' % (self.url, line)) self.pmda.debug("DEBUG url: %s HEADERS: %s" % (self.url, self.headers)) if self.pmda.dbg else None self.pmda.debug("DEBUG url: %s FILTERS: %s" % (self.url, self.filterlist)) if self.pmda.dbg else None @@ -911,8 +941,7 @@ class Source(object): except Exception as e: self.pmda.stats_status[self.cluster] = 'failed to fetch URL or execute script %s: %s' % (self.path, e) self.pmda.stats_status_code[self.cluster] = status_code - self.pmda.debug('Warning: cannot fetch URL or execute script %s: %s' % (self.path, e)) if self.pmda.dbg else None - return + self.pmda.debug('cannot fetch URL or execute script %s: %s' % (self.path, e)) def refresh2(self, timeout): ''' @@ -950,7 +979,7 @@ class Source(object): self.pmda.log("fetch: item=%d inst=%d m.mname=%s" % (item, inst, m.mname)) if self.pmda.dbg else None return m.fetch_inst(inst) except Exception as e: - self.pmda.log("Warning: cannot fetch item %d inst %d: %s" % (item, inst, e)) + self.pmda.debug("cannot fetch item %d inst %d: %s" % (item, inst, e)) return [c_api.PM_ERR_AGAIN, 0] class OpenMetricsPMDA(PMDA): @@ -1468,7 +1497,7 @@ class OpenMetricsPMDA(PMDA): def debug(self, s): if self.dbg: - super(OpenMetricsPMDA, self).log("debug: " + s) + super(OpenMetricsPMDA, self).dbg(s) def log(self, message): PMDA.log(message) @@ -1526,8 +1555,9 @@ if __name__ == '__main__': if args.nosort: sort_conf_list = False - pmdadir = os.path.join(os.getenv('PCP_PMDAS_DIR'), args.root) if not args.config.startswith("/"): + pmdadir = os.getenv('PCP_PMDAS_DIR') or '/' + pmdadir = os.path.join(pmdadir, args.root) args.config = os.path.join(pmdadir, args.config) # This PMDA starts up in the "notready" state, see the Install script where