#!/usr/bin/env python3
#
# perfmon - a daemon for monitoring performance of the host on which it is run
# and of all the local VMs, and for generating events based on configurable
# triggers
#
# Notes:
# ======
# The XAPI instance running on localhost monitors a number of variables
# for each VM running locally (i.e. not on other pool members) and
# for the host itself. Each variable is stored in 16 RRDs (Round Robin Databases).
#
#  Consolidation   Number of samples in RRD
#  function        5s/sample  1m/sample 1hr/sample 1day/sample
#  AVERAGE         120 (10m)  120 (2h)  ?          ?
#  MIN             120 (10m)  120 (2h)  ?          ?
#  MAX             120 (10m)  120 (2h)  ?          ?
#  LAST            120 (10m)  120 (2h)  ?          ?
#
# The "Consolidation function" tells how that RRD is built up from the
# one with the next highest sample rate.  E.g. in the 1m/sample "AVERAGE" RRD
# each sample is the average of 12 from the 5s/sample "AVERAGE" RRD, whereas
# in the 1m/sample "MIN" RRD each sample is the minimum of 12 from the 5s/sample
# "AVERAGE" RRD.
#
# When XAPI is queried over http it selects the column (e.g. "1hr/sample")
# based on the "start" CGI param.  It will return the highest level of granularity
# available for the period requested.
#
# The "cf" CGI param specifies the row. If it is not set, all rows are returned.

# pylint: disable=too-many-lines, missing-class-docstring
# pytype: disable=attribute-error

import subprocess
import gc
import getopt
import os
import random
import re
import signal
import socket
import sys
import syslog
import time
import traceback
import urllib.request

# used to parse rrd_updates because this may be large and sax is more efficient
from xml import sax

# used to parse other-config:perfmon.  Efficiency is less important than reliability here
from xml.dom import minidom # pytype: disable=pyi-error
from xml.parsers.expat import ExpatError

import XenAPI


def print_debug(string): # pragma: no cover
    """Emit a debug message to stderr and syslog when the global `debug` flag is set."""
    if not debug:
        return
    print("DEBUG:", string, file=sys.stderr)
    priority = syslog.LOG_USER | syslog.LOG_INFO
    syslog.syslog(priority, "PERFMON(DEBUG): %s" % string)


def log_err(string):
    """Report an error message on stderr and in syslog (user facility, error level)."""
    print(string, file=sys.stderr)
    priority = syslog.LOG_USER | syslog.LOG_ERR
    syslog.syslog(priority, "PERFMON: %s" % string)


def log_info(string): # pragma: no cover
    """Report an informational message on stderr and in syslog.

    The syslog priority combines the user facility with the info level,
    matching the facility used by the other logging helpers in this file.
    """
    print(string, file=sys.stderr)
    # The facility used to be given as LOG_INFO by mistake (LOG_INFO | LOG_INFO),
    # which left the priority without any explicit facility.
    syslog.syslog(syslog.LOG_USER | syslog.LOG_INFO, "PERFMON: %s" % string)


def debug_mem(): # pragma: no cover
    """Log how many live objects of each class the garbage collector tracks.

    Runs a full collection first, then counts the tracked objects by class
    name and writes one "<name> :<count>" line per class through log_info().
    """
    gc.collect()
    obj_count = {}
    for obj in gc.get_objects():
        cls = getattr(obj, "__class__", None)
        # fall back to the type object itself when no usable __class__ exists
        name = cls.__name__ if cls else type(obj)
        obj_count[name] = obj_count.get(name, 0) + 1

    # Build the report once, after counting; it used to be rebuilt on every
    # loop iteration and was undefined when there were no objects at all.
    output = ["%s :%s" % (name, cnt) for name, cnt in obj_count.items()]
    log_info("\n".join(output))


class PerfMonException(Exception):
    """Base class for perfmon-specific errors."""


class XmlConfigException(PerfMonException):
    """Raised when a perfmon XML configuration is missing or has invalid values."""


class UsageException(Exception):
    """Usage error (presumably for bad command-line arguments; raised outside this section)."""


class IncorrectInputException(Exception):
    """Raised when a consolidation function is given the wrong number of values."""

# Start a session with the master of a pool.
# Note: when calling http://localhost/rrd_update we must pass the session
# ID as a param.  The host then uses this to verify our validity with
# the master before responding.
# If the verification fails we should get a 401 response
class XapiSession(XenAPI.Session): # pragma: no cover
    """XenAPI session opened over the local unix-domain-socket transport.

    The constructor logs in immediately; the session is logged out again
    when the object is finalised.  An instance is what the monitors and
    RRDUpdates.refresh() take as their `session` argument.
    """

    def __init__(self):
        uds_transport = XenAPI.UDSTransport()
        super().__init__("http://_var_xapi_xapi", transport=uds_transport)
        self.xenapi.login_with_password("", "", "1.0", "xen-api-scripts-perfmon")

    def __del__(self):
        # log out when the session object goes away
        self.xenapi.session.logout()

    def id(self):
        """Return the session reference stored by XenAPI.Session after login."""
        return self._session


class ObjectReport:
    """Samples collected for one object (e.g. a VM or a host) from rrd_updates.

    Attributes:
        objtype: a string like "vm" or "host", taken from an <entry> tag
        uuid:    the object's uuid
        vars:    maps rrd variable name to a list of floats
    """

    def __init__(self, objtype, uuid):
        self.objtype = objtype
        self.uuid = uuid
        self.vars = {}

    def get_uuid(self):
        """Return the uuid of the reported object."""
        return self.uuid

    def get_var_names(self):
        """Return the names of all variables that have samples, as a new list."""
        return list(self.vars)

    def get_value(self, var_name, row):
        """Return sample `row` of `var_name`, or 0.0 if it cannot be looked up."""
        try:
            samples = self.vars[var_name]
            return samples[row]
        except Exception:
            return 0.0

    def insert_value(self, var_name, index, value):
        """Insert `value` at position `index` of `var_name`'s sample list.

        The list is created on first use of a variable name.
        """
        self.vars.setdefault(var_name, []).insert(index, value)


# pylint: disable=too-few-public-methods
class RRDReport:
    """Plain data holder that RRDContentHandler fills in while parsing."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Put every field back to its empty default."""
        # maps uuids to ObjectReports, built from xml
        self.obj_reports = {}
        # number of xapi vars / number of samples in xml
        self.columns = self.rows = 0
        # timestamps of the 1st and last sample in xml
        self.start_time = self.end_time = 0
        # seconds between each pair of samples
        self.step_time = 0


class RRDColumn:
    """Internal record of RRDContentHandler: one legend column.

    Pairs the rrd parameter name of the column with the ObjectReport that
    receives the column's values.
    """

    def __init__(self, paramname, obj_report):
        self.paramname, self.obj_report = paramname, obj_report


# pylint: disable=too-many-instance-attributes
class RRDContentHandler(sax.ContentHandler):
    """SAX handler that loads an rrd_updates document into an RRDReport.

    Expected document layout:
    <xport>
      <meta>
       <start>INTEGER</start>
       <step>INTEGER</step>
       <end>INTEGER</end>
       <rows>INTEGER</rows>
       <columns>INTEGER</columns>
       <legend>
        <entry>IGNOREME:(host|vm):UUID:PARAMNAME</entry>
        ... another COLUMNS-1 entries ...
       </legend>
      </meta>
      <data>
       <row>
        <t>INTEGER(END_TIME)</t>
        <v>FLOAT</v>
        ... another COLUMNS-1 values ...
       </row>
       ... another ROWS-2 rows
       <row>
        <t>INTEGER(START_TIME)</t>
        <v>FLOAT</v>
        ... another COLUMNS-1 values ...
       </row>
      </data>
    </xport>
    """

    # Tags that only need their "in_<tag>_tag" flag raised when they open
    _FLAG_FOR_TAG = {
        "start": "in_start_tag",
        "step": "in_step_tag",
        "end": "in_end_tag",
        "rows": "in_rows_tag",
        "columns": "in_columns_tag",
        "entry": "in_entry_tag",
    }

    # Integer <meta> tags and the report attribute each one is stored in
    _REPORT_FIELD_FOR_TAG = {
        "start": "start_time",
        "step": "step_time",
        "end": "end_time",
        "rows": "rows",
        "columns": "columns",
    }

    def __init__(self, report):
        """Keep a reference to `report`, which is updated while parsing.

        `report` should already contain its default values.
        """
        super().__init__()
        self.report = report

        # "currently inside this tag" markers
        self.in_start_tag = False
        self.in_step_tag = False
        self.in_end_tag = False
        self.in_rows_tag = False
        self.in_columns_tag = False
        self.in_entry_tag = False
        self.in_row_tag = False
        self.in_t_tag = False
        self.in_v_tag = False

        self.column_details = []  # one RRDColumn per <entry>, in legend order
        self.raw_text = ""  # text gathered for the element being read
        self.row = 0  # number of <row> elements completed so far
        self.col = 0  # index of the next <v> within the current row

    def startElement(self, name, attrs):
        # every opening tag starts a fresh text buffer
        self.raw_text = ""

        flag = self._FLAG_FOR_TAG.get(name)
        if flag is not None:
            setattr(self, flag, True)
        elif name == "row":
            self.in_row_tag = True
            self.col = 0
        elif self.in_row_tag:
            # <t> and <v> only count when nested in a <row>
            if name == "t":
                self.in_t_tag = True
            elif name == "v":
                self.in_v_tag = True

    def characters(self, content):
        # Text directly under <row> is ignored: <row>s are just for holding
        # <t> and <v> nodes, so in_row_tag is deliberately not tested here.
        capturing = (
            self.in_start_tag
            or self.in_step_tag
            or self.in_end_tag
            or self.in_rows_tag
            or self.in_columns_tag
            or self.in_entry_tag
            or self.in_t_tag
            or self.in_v_tag
        )
        if capturing:
            self.raw_text += content

    def endElement(self, name):
        report_field = self._REPORT_FIELD_FOR_TAG.get(name)
        if report_field is not None:
            # start_time / end_time from <meta> are overwritten later if
            # there are any rows
            setattr(self.report, report_field, int(self.raw_text))
            setattr(self, self._FLAG_FOR_TAG[name], False)
        elif name == "entry":
            (_, objtype, uuid, paramname) = self.raw_text.split(":")
            # fetch the ObjectReport of this uuid, creating it on first sight
            reports = self.report.obj_reports
            if uuid not in reports:
                reports[uuid] = ObjectReport(objtype, uuid)

            # remember which parameter / object this column belongs to
            self.column_details.append(RRDColumn(paramname, reports[uuid]))
            self.in_entry_tag = False
        elif name == "row":
            self.in_row_tag = False
            self.row += 1
        elif name == "t":
            # Row timestamps are preferred over the <meta> values as they
            # are more reliable.
            timestamp = int(self.raw_text)
            # rows arrive newest first, so the last one seen is the start time
            self.report.start_time = timestamp
            if self.row == 0:
                # ... and the first one is the end time
                self.report.end_time = timestamp
            self.in_t_tag = False
        elif name == "v":
            sample = float(self.raw_text)

            # the column position tells which object / parameter this is
            column = self.column_details[self.col]
            # index=0 because this is the earliest sample seen so far
            column.obj_report.insert_value(column.paramname, index=0, value=sample)

            # advance to the next column of the row
            self.col += 1
            self.in_v_tag = False


# An object of this class should persist the lifetime of the program
class RRDUpdates:
    """Object used to get and parse the output of http://localhost/rrd_updates?..."""

    def __init__(self):
        # params are what get passed to the CGI executable in the URL
        self.params = {}
        self.params["start"] = int(time.time()) - interval  # interval seconds ago
        self.params["host"] = "true"  # include data for host (as well as for VMs)
        self.params["sr_uuid"] = "all"  # include data for all SRs attached to this host
        self.params["cf"] = (
            "AVERAGE"  # consolidation function, each sample averages 12 from the 5 second RRD
        )
        self.params["interval"] = str(rrd_step)  # distinct from the perfmon interval
        self.report = RRDReport()  # data structure updated by RRDContentHandler

    def __repr__(self):
        return "<RRDUpdates object: params=%s>" % str(self.params)

    def refresh(self, session, override_params=None):
        """Re-read rrd_updates over HTTP and parse the reply into self.report.

        session: object whose id() gives the session id sent as "session_id"
        override_params: optional mapping of extra query parameters.  It is
            copied, never modified.  Note that entries of self.params are
            applied last, so they win over same-named entries given here.

        Afterwards self.params["start"] is moved to just after the last
        sample received, so the next call does not fetch the same data twice.
        """
        # Work on a copy: the caller's mapping used to be aliased and then
        # polluted with the session id and all of self.params.
        params = dict(override_params) if override_params is not None else {}
        params["session_id"] = session.id()
        params.update(self.params)
        paramstr = "&".join(["%s=%s" % (k, v) for k, v in params.items()])
        print_debug("Calling http://localhost/rrd_updates?%s" % paramstr)

        url = "http://localhost/rrd_updates?%s" % paramstr
        with urllib.request.urlopen(url) as sock:
            xmlsource = sock.read().decode("utf-8")

        # Use sax rather than minidom and save vast amounts of time and memory.
        self.report.reset()
        sax.parseString(xmlsource, RRDContentHandler(self.report))

        # Update the time used on the next run
        self.params["start"] = (
            self.report.end_time + 1
        )  # avoid retrieving same data twice

        print_debug(
            "Refreshed rrd_updates, start = %d, end = %d, rows = %d"
            % (self.report.start_time, self.report.end_time, self.report.rows)
        )

    def get_num_rows(self):
        "Return the number of samples of each parameter"
        return self.report.rows

    def get_obj_report_by_uuid(self, uuid):
        "Return an ObjectReport for the object with this uuid, or None if there is none"
        try:
            return self.report.obj_reports[uuid]
        except Exception:
            return None

    def get_uuid_list_by_objtype(self, objtype):
        '''
        Return a list of uuids corresponding to the objects
        of this type for which we have ObjectReports
        '''
        return [
            objrep.uuid
            for objrep in self.report.obj_reports.values()
            if objrep.objtype == objtype
        ]


# Consolidation functions:
# Names accepted as the "consolidation_fn" of a variable.  VariableConfig
# rejects any name not listed here and turns an accepted name into the
# callable of the same name with eval(), so every entry must be the name of
# a builtin or of a function defined in this module.
supported_consolidation_functions = [
    "sum",
    "average",
    "max",
    "get_percent_fs_usage",
    "get_percent_log_fs_usage",
    "get_percent_mem_usage",
    "get_percent_sr_usage",
]


def average(mylist):
    """Return the arithmetic mean of `mylist` as a float.

    An empty input is logged as an error and yields 0.0.
    """
    if mylist:
        return sum(mylist) / float(len(mylist))
    log_err("Error in average, no input data, return 0.0 instead")
    return 0.0


def get_percent_log_fs_usage(_):
    '''
    Return the usage of the partition holding /var/log as a fraction
    (df's percentage divided by 100), or NaN when /var/log is not on a
    separate filesystem from /etc/passwd.
    The input list is ignored and should be empty
    '''

    def df_fields(command):
        # run df, drop its header line and split the rest into columns
        body_lines = subprocess.getoutput(command).splitlines()[1:]
        return " ".join(body_lines).split()

    root_fields = df_fields("df /etc/passwd")
    log_fields = df_fields("df /var/log")

    # Same first column means there is no dedicated logs partition
    if root_fields[0] == log_fields[0]:
        return float("NaN")

    percentage = log_fields[4]
    # strip the trailing % character and scale to a fraction
    return float(percentage[:-1]) / 100.0


def get_percent_fs_usage(_):
    '''
    Return the usage of the host filesystem as a fraction
    (df's percentage divided by 100).
    The input list is ignored and should be empty
    '''
    # /etc/passwd is on the filesystem of interest in both OEM and Retail
    df_lines = subprocess.getoutput("df /etc/passwd").splitlines()
    # skip the header line; joining the rest copes with df wrapping its output
    fields = " ".join(df_lines[1:]).split()
    percentage = fields[4]
    # strip the trailing % character and scale to a fraction
    return float(percentage[:-1]) / 100.0


def get_percent_mem_usage(_):
    '''
    Return Dom0 memory demand (memory in use plus swap in use) as a
    fraction of physical memory, read from /proc/meminfo.
    The input list is ignored and should be empty.
    Any failure is logged and reported as 0.0.
    '''
    try:
        with open("/proc/meminfo", "r", encoding="utf-8") as memfd:
            meminfo_lines = memfd.readlines()

        # Each line is "<memtype>: <size> ...": keep the first number per type.
        # A line without ':' or without digits raises here and is handled by
        # the except clause below (result 0.0).
        memdict = {}
        for line in meminfo_lines:
            memtype, rest = line.split(":", 1)
            memdict[memtype.strip()] = float(re.search(r"\d+", rest.strip()).group(0))

        # The sum of resident memory and swap in use is treated as the hard
        # demand for memory.  Demand beyond physical memory is bad: swapping
        # is then obligatory rather than voluntary, which degrades
        # performance.  The metric is (res_mem + swap_in_use) / phy_mem and
        # may therefore exceed 100% (which is considered bad).
        mem_total = memdict["MemTotal"]
        mem_in_use = (
            mem_total - memdict["MemFree"] - memdict["Buffers"] - memdict["Cached"]
        )
        swap_in_use = memdict["SwapTotal"] - memdict["SwapFree"]
        return float(mem_in_use + swap_in_use) / mem_total
    except Exception as e:
        log_err("Error %s in get_percent_mem_usage, return 0.0 instead" % e)
        return 0.0


def get_percent_sr_usage(mylist):
    """
    Return the usage of an SR as a fraction: physical_utilisation / size.

    `mylist` must hold exactly two items: [physical_utilisation, size].
    Any other length, or any error during the division, is logged and
    reported as 0.0.
    """
    try:
        count = len(mylist)
        if count != 2:
            raise IncorrectInputException(
                "Incorrect number of values to consolidate: %d (exactly 2 values)"
                % count
            )
        used, total = mylist[0:2]
        return float(used) / total
    except Exception as e:
        log_err("Error %s in get_percent_sr_usage, return 0.0 instead" % e)
        return 0.0


# pylint: disable=too-few-public-methods
class VariableConfig:
    """Parsed and validated configuration of one monitored variable.

    Initialisation parameters:
    xmldoc = dom object representing one <variable> node of an ObjectMonitor
            config string.  See VMMonitor.__doc__ and HOSTMonitor.__doc__
    alarm_create_callback =
            callback called by Variable.update() to create and send an alarm
    get_default_variable_config =
            function(variable_name, tag) used to look up the value of any
            tag that the <variable> node does not supply

    Raises XmlConfigException when the name tag is missing or a value
    cannot be used.
    """

    def __init__(self, xmldoc, alarm_create_callback, get_default_variable_config):
        try:
            name = xmldoc.getElementsByTagName("name")[0].getAttribute("value")
        except IndexError as e:
            raise XmlConfigException("variable missing 'name' tag") from e

        def lookup(tag):
            # explicit value from the xml if there is one, else the default
            try:
                return xmldoc.getElementsByTagName(tag)[0].getAttribute("value")
            except Exception:
                return get_default_variable_config(name, tag)

        def convert(caster, raw, template):
            # turn `raw` into a number; `template` words the config error
            try:
                return caster(raw)
            except Exception as e:
                raise XmlConfigException(template % (name, raw)) from e

        # The lookup order matters: a default lookup may itself raise
        (
            raw_regex,
            raw_consolidation_fn,
            raw_trigger_level,
            raw_trigger_period,
            raw_inhibit_period,
            trigger_sense,
            priority,
        ) = [
            lookup(tag)
            for tag in (
                "rrd_regex",
                "consolidation_fn",
                "alarm_trigger_level",
                "alarm_trigger_period",
                "alarm_auto_inhibit_period",
                "alarm_trigger_sense",
                "alarm_priority",
            )
        ]

        # The xml node is kept because the body of each alarm quotes it
        self.xmldoc = xmldoc
        self.name = name

        # the regex has to match a whole rrd parameter name
        try:
            self.rrd_regex = re.compile("^%s$" % raw_regex)
        except Exception as e:
            raise XmlConfigException(
                "variable %s: regex %s does not compile" % (name, raw_regex)
            ) from e

        if raw_consolidation_fn not in supported_consolidation_functions:
            raise XmlConfigException(
                "variable %s: consolidation function %s not supported"
                % (name, raw_consolidation_fn)
            )
        # eval is fine here: the name was just checked against the whitelist
        # pylint: disable=eval-used
        self.consolidation_fn = eval(raw_consolidation_fn)

        self.alarm_trigger_period = convert(
            int,
            raw_trigger_period,
            "variable %s: alarm_trigger_period %s not an int",
        )
        self.alarm_auto_inhibit_period = convert(
            int,
            raw_inhibit_period,
            "variable %s: alarm_auto_inhibit_period %s not an int",
        )
        trigger_level = convert(
            float,
            raw_trigger_level,
            "variable %s: alarm_trigger_level %s not a float",
        )

        self.alarm_priority = priority

        # test_level() says whether the current self.value is 'bad'.
        # Any sense other than "high" is treated as "low".
        self.test_level = (
            (lambda: (self.value > trigger_level))
            if trigger_sense == "high"
            else (lambda: (self.value < trigger_level))
        )
        self.alarm_create_callback = alarm_create_callback


def variable_configs_differ(vc1, vc2):
    """Return True when the XML of the two variable configurations is not identical."""
    first_xml = vc1.xmldoc.toxml()
    second_xml = vc2.xmldoc.toxml()
    return first_xml != second_xml


class VariableState:
    """Mutable run-time state of a Variable.

    Not usable on its own: __init__ reads `alarm_auto_inhibit_period` and
    `alarm_trigger_period`, which VariableConfig defines.  Class Variable
    inherits from both VariableConfig and VariableState.
    """

    def __init__(self):
        # no sample seen yet
        self.value = None
        # a full trigger period of 'bad' samples is still needed
        self.trigger_down_counter = self.alarm_trigger_period
        # back-date the last alarm by one inhibit period
        self.timeof_last_alarm = time.time() - self.alarm_auto_inhibit_period


class Variable(VariableConfig, VariableState):
    """One monitored variable: its configuration plus its run-time state.

    ObjectMonitor creates one Variable object for each variable specified
    in its config string.
    """

    def __init__(self, *args):
        # the config must be set up first: the state is initialised from it
        VariableConfig.__init__(self, *args)
        VariableState.__init__(self)
        self.active = True
        print_debug("Created Variable %s" % self.name)

    def set_active(self, active):
        """Change the active flag; the state is reset when reactivating."""
        print_debug(
            "set_active on %s. (old, new) = (%s, %s)" % (self.name, self.active, active)
        )
        if active != self.active:
            self.active = active
            if active:
                VariableState.__init__(self)

    def __generate_alarm(self, session):
        """Send an alarm through the callback provided by the creator

        ... unless one was already generated during the last
        self.alarm_auto_inhibit_period seconds
        """
        now = time.time()
        elapsed = now - self.timeof_last_alarm
        print_debug(
            "Time since last alarm for var %s is %d - %d = %d. Refractory period = %d."
            % (
                self.name,
                now,
                self.timeof_last_alarm,
                elapsed,
                self.alarm_auto_inhibit_period,
            )
        )
        if elapsed >= self.alarm_auto_inhibit_period:
            # outside the auto inhibit period: record the time and send
            self.timeof_last_alarm = now
            message = "value: %f\nconfig:\n%s" % (self.value, self.xmldoc.toprettyxml())
            self.alarm_create_callback(self, session, message)

    def update(self, value, session):
        """Store a new value for the variable and check it against the trigger

        Calls self.__generate_alarm() once the level has been 'bad' for
        self.alarm_trigger_period seconds, counted down in steps of rrd_step
        """
        self.value = value
        print_debug("Variable %s set to %f" % (self.name, value))

        if not self.test_level():
            # level good - reset trigger counter
            self.trigger_down_counter = self.alarm_trigger_period
            return

        # level is bad
        self.trigger_down_counter -= rrd_step
        if self.trigger_down_counter > 0:
            return
        self.__generate_alarm(session)
        # reset trigger counter
        self.trigger_down_counter = self.alarm_trigger_period


class ObjectMonitor:
    """Common base of the per-object monitors (e.g. VMMonitor)

    Public attributes are uuid, variables, refresh_config() and
    process_rrd_updates().
    Subclasses have to set self.monitortype before calling __init__ here and
    have to provide get_default_variable_config(variable_name, config_tag).
    """

    def __init__(self, uuid):
        self.uuid = uuid
        self.xmlconfig = None
        # "variables" is the public attribute of interest
        self.variables = []
        self.refresh_config()

    def refresh_config(self):
        """Pick up a changed config string; return True if it had changed.

        Config and XML parse errors are logged, not raised; True is still
        returned in that case.
        """
        if not self.__update_xmlconfig():
            return False  # config unchanged

        # config has changed - reparse it
        try:
            self.__parse_xmlconfig()
        except XmlConfigException as e:
            log_err("%s %s config error: %s" % (self.monitortype, self.uuid, str(e)))
        except ExpatError as e:
            log_err(
                "%s %s XML parse error: %s" % (self.monitortype, self.uuid, str(e))
            )
        return True

    def __update_xmlconfig(self):
        # take this object's entry of the global all_xmlconfigs, None if absent
        latest = all_xmlconfigs[self.uuid] if self.uuid in all_xmlconfigs else None
        if latest != self.xmlconfig:
            self.xmlconfig = latest
            return True
        return False

    def __parse_xmlconfig(self):
        if not self.xmlconfig:
            # Possible if this VM/host is not configured yet
            self.variables = []
            return

        variable_nodes = minidom.parseString(self.xmlconfig).getElementsByTagName(
            "variable"
        )
        variable_names = []

        for node in variable_nodes:
            # create a variable using the config in this node
            var = Variable(node, self.alarm_create, self.get_default_variable_config)

            # Update list of variable names
            if var.name not in variable_names:
                variable_names.append(var.name)

            # variables already present under the same name: should be 0 or 1
            existing = [old for old in self.variables if old.name == var.name]
            append_var = True
            for position, old in enumerate(existing):
                if position > 0:
                    log_err(
                        "programmer error: found duplicate variable %s (uuid %s)"
                        % (var.name, self.uuid)
                    )
                    self.variables.remove(old)
                    continue

                # Replace the old variable only if its config has changed,
                # so that an unchanged variable keeps its state
                if variable_configs_differ(var, old):
                    self.variables.remove(old)
                else:
                    append_var = False

            if append_var:
                print_debug(
                    "Appending %s to list of variables for %s UUID=%s"
                    % (var.name, self.monitortype, self.uuid)
                )
                self.variables.append(var)

        # Now delete any old variables that do not appear in the new variable_nodes
        stale = [old for old in self.variables if old.name not in variable_names]
        for old in stale:
            print_debug(
                "Deleting %s from list of variables for UUID=%s" % (old.name, self.uuid)
            )
            self.variables.remove(old)

    def get_active_variables(self):
        """Return the list of variables of this monitor."""
        return self.variables

    def process_rrd_updates(self, rrd_updates, session):
        """Feed the samples reported for this object to each of its variables."""
        print_debug(
            "%sMonitor processing rrd_updates for %s" % (self.monitortype, self.uuid)
        )
        obj_report = rrd_updates.get_obj_report_by_uuid(self.uuid)
        num_rows = rrd_updates.get_num_rows()
        if not obj_report:
            return
        reported_params = obj_report.get_var_names()

        for var in self.get_active_variables():
            # the subset of the reported params that is consolidated into var
            wanted = [p for p in reported_params if var.rrd_regex.match(p)]
            for row in range(num_rows):
                samples = [obj_report.get_value(param, row) for param in wanted]
                # Consolidate and pass the result on to the variable object.
                # This may result in an alarm being generated
                var.update(var.consolidation_fn(samples), session)

    def alarm_create(self, var, session, message):
        "Callback used by Variable var to actually send an alarm"
        print_debug(
            "Creating an alarm for %s %s, message: %s"
            % (self.monitortype, self.uuid, message)
        )
        session.xenapi.message.create(
            "ALARM", var.alarm_priority, self.monitortype, self.uuid, message
        )


class VMMonitor(ObjectMonitor):
    """Object that maintains state of one VM

    Configured by writing an xml string into an other-config key, e.g.
    xe vm-param-set uuid=$vmuuid other-config:perfmon=\
       '<config><variable><name value="cpu_usage"/>
       <alarm_trigger_level value="0.5"/></variable></config>'

    Notes:
     - Multiple <variable> nodes allowed
     - full list of child nodes is
       * name: what to call the variable (no default)
       * alarm_priority: the priority of the messages generated (default '3')
       * alarm_trigger_level: level of value that triggers an alarm
            (default '0.9' for 'fs_usage' and 'log_fs_usage', '0.95' for
            'mem_usage', no default for anything else)
       * alarm_trigger_sense:
            'high' if alarm_trigger_level is a max, otherwise 'low'.
            (default 'low' for 'memory_internal_free', otherwise 'high')
       * alarm_trigger_period:
            num seconds of 'bad' values before an alarm is sent (default '60')
       * alarm_auto_inhibit_period:
            num seconds this alarm disabled after an alarm is sent (default '3600')
       * consolidation_fn:
            how to combine variables from rrd_updates into one value
            (default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage',
            'get_percent_log_fs_usage' for 'log_fs_usage',
            'get_percent_mem_usage' for 'mem_usage', & 'sum' for everything else)
       * rrd_regex matches the names of variables
         from (xe vm-data-sources-list uuid=$vmuuid) used to compute value
         (has defaults for "cpu_usage", "network_usage", "disk_usage" and
         "memory_internal_free"; "fs_usage", "log_fs_usage" and "mem_usage"
         default to a pattern that matches nothing)
    """

    # Default consolidation function by variable name ('sum' for the rest)
    _DEFAULT_CONSOLIDATION_FN = {
        "cpu_usage": "average",
        "fs_usage": "get_percent_fs_usage",
        "log_fs_usage": "get_percent_log_fs_usage",
        "mem_usage": "get_percent_mem_usage",
    }

    # Default rrd_regex by variable name (no default for the rest)
    _DEFAULT_RRD_REGEX = {
        "cpu_usage": "cpu[0-9]+",
        "network_usage": "vif_[0-9]+_[rt]x",
        "disk_usage": "vbd_(xvd|hd)[a-z]+_(read|write)",
        "fs_usage": "_$_DUMMY__",  # match nothing
        "log_fs_usage": "_$_DUMMY__",  # match nothing
        "mem_usage": "_$_DUMMY__",  # match nothing
        "memory_internal_free": "memory_internal_free",
    }

    # Default alarm_trigger_level by variable name (no default for the rest)
    _DEFAULT_TRIGGER_LEVEL = {
        "fs_usage": "0.9",  # trigger when 90% full
        "log_fs_usage": "0.9",  # trigger when 90% full
        "mem_usage": "0.95",  # trigger when mem demanded is close to phy_mem
    }

    # Defaults that do not depend on the variable name
    _FIXED_DEFAULTS = {
        "alarm_trigger_period": "60",  # 1 minute
        "alarm_auto_inhibit_period": "3600",  # 1 hour
        "alarm_priority": "3",  # Service degradation level defined in PR-1455
    }

    def __init__(self, *args):
        self.monitortype = "VM"
        ObjectMonitor.__init__(self, *args)
        print_debug("Created VMMonitor with uuid %s" % self.uuid)

    def get_default_variable_config(self, variable_name, config_tag):
        """Return the default value (a string) of `config_tag` for `variable_name`

        This allows the user to not specify the full set of tags for each
        variable in the xml config.  Raises XmlConfigException when there is
        no default for the combination.
        """
        if config_tag == "consolidation_fn":
            return self._DEFAULT_CONSOLIDATION_FN.get(variable_name, "sum")

        if config_tag == "rrd_regex":
            regex = self._DEFAULT_RRD_REGEX.get(variable_name)
            if regex is None:
                raise XmlConfigException(
                    "variable %s: no default rrd_regex - please specify one"
                    % variable_name
                )
            return regex

        if config_tag == "alarm_trigger_level":
            level = self._DEFAULT_TRIGGER_LEVEL.get(variable_name)
            if level is None:
                raise XmlConfigException(
                    "variable %s: no default alarm_trigger_level - please specify one"
                    % variable_name
                )
            return level

        if config_tag == "alarm_trigger_sense":
            # free memory is bad when *below* the level, the rest when *above*
            return "low" if variable_name == "memory_internal_free" else "high"

        fixed = self._FIXED_DEFAULTS.get(config_tag)
        if fixed is None:
            raise XmlConfigException(
                "variable %s: no default available for tag %s"
                % (variable_name, config_tag)
            )
        return fixed


class SRMonitor(ObjectMonitor):
    """Object that maintains state of one SR

    Configured by writing an xml string into an other-config key, e.g.
    xe sr-param-set uuid=$sruuid other-config:perfmon=\
       '<config><variable><name value="physical_utilisation"/>
       <alarm_trigger_level value="0.8"/></variable></config>'

    Notes:
     - Multiple <variable> nodes allowed
     - full list of child nodes is
       * name: what to call the variable (no default)
       * alarm_priority: the priority of the messages generated (default '3')
       * alarm_trigger_level: level of value that triggers an alarm
            (default '0.8' for 'physical_utilisation', otherwise no default)
       * alarm_trigger_sense:
            'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high')
       * alarm_trigger_period:
            num seconds of 'bad' values before an alarm is sent (default '60')
       * alarm_auto_inhibit_period:
            num seconds this alarm disabled after an alarm is sent (default '3600')
       * consolidation_fn:
            how to combine variables from rrd_updates into one value
            (default is 'get_percent_sr_usage' for 'physical_utilisation',
            & 'sum' for everything else)
       * rrd_regex matches the names of variables
         from (xe sr-data-source-list uuid=$sruuid) used to compute value
         (has default for "physical_utilisation")
    """

    def __init__(self, *args):
        """Create the monitor; all arguments are forwarded to ObjectMonitor."""
        # monitortype is set before the base-class constructor runs
        self.monitortype = "SR"
        ObjectMonitor.__init__(self, *args)
        print_debug("Created SRMonitor with uuid %s" % self.uuid)

    def get_default_variable_config(self, variable_name, config_tag):
        """Return the default value (a string) of config_tag for variable_name.

        This allows user to not specify full set of tags for each variable
        in xml config.

        Raises XmlConfigException when the tag has no default for this
        variable (or the tag is unknown).
        """
        if config_tag == "consolidation_fn":
            if variable_name == "physical_utilisation":
                return "get_percent_sr_usage"
            else:
                return "sum"
        elif config_tag == "rrd_regex":
            if variable_name == "physical_utilisation":
                return "physical_utilisation|size"
            elif variable_name == "sr_io_throughput_total_per_host":
                # (these are to drive Host RRDs and so are handled by the HOSTMonitor)
                return "_$_DUMMY__"
            else:
                raise XmlConfigException(
                    "variable %s: no default rrd_regex - please specify one"
                    % variable_name
                )
        elif config_tag == "alarm_trigger_period":
            return "60"  # 1 minute
        elif config_tag == "alarm_auto_inhibit_period":
            return "3600"  # 1 hour
        elif config_tag == "alarm_trigger_level":
            # The misspelt name used to be the only one matched here, which
            # meant "physical_utilisation" never got its default.  The old
            # spelling is still accepted so existing configs keep working.
            if variable_name in ("physical_utilisation", "physical_utilistaion"):
                return "0.8"  # trigger when 80% full
            else:
                raise XmlConfigException(
                    "variable %s: no default alarm_trigger_level - please specify one"
                    % variable_name
                )
        elif config_tag == "alarm_trigger_sense":
            return "high"  # trigger if *above*
        elif config_tag == "alarm_priority":
            return "3"  # Service degradation level defined in PR-1455
        else:
            raise XmlConfigException(
                "variable %s: no default available for tag %s"
                % (variable_name, config_tag)
            )


class HOSTMonitor(ObjectMonitor):
    """Object that maintains state of one Host

    Configured by writing an xml string into an other-config key, e.g.
    xe host-param-set uuid=$hostuuid other-config:perfmon=\
       '<config><variable><name value="cpu_usage"/>
       <alarm_trigger_level value="0.5"/></variable></config>'

    Notes:
     - Multiple <variable> nodes allowed
     - full list of child nodes is
       * name: what to call the variable (no default)
       * alarm_priority: the priority of the messages generated (default '3')
       * alarm_trigger_level: level of value that triggers an alarm (no default)
       * alarm_trigger_sense:
            'high' if alarm_trigger_level is a max, otherwise 'low'.
            (default 'low' for 'memory_free_kib', otherwise 'high')
       * alarm_trigger_period:
            num seconds of 'bad' values before an alarm is sent (default '60')
       * alarm_auto_inhibit_period:
            num seconds this alarm disabled after an alarm is sent (default '3600')
       * consolidation_fn: how to combine variables from rrd_updates into one value
            (default is 'average' for 'cpu_usage' & 'sum' for everything else)
       * rrd_regex matches the names of variables
         from (xe host-data-source-list uuid=$hostuuid) used to compute value
         (only has defaults for "cpu_usage", "network_usage", "memory_free_kib"
         and "sr_io_throughput_total_xxxxxxxx"
         where that last one ends with the first eight characters of the SR uuid)

    Also, as a special case for SR throughput, it is possible to configure a Host by
    writing xml into the other-config key of an SR connected to it, e.g.
    xe sr-param-set uuid=$sruuid other-config:perfmon=\
       '<config><variable><name value="sr_io_throughput_total_per_host"/>
       <alarm_trigger_level value="0.01"/></variable></config>'

    This only works for that one specific variable-name,
    and rrd_regex must not be specified.
    Configuration done on the host directly
    (variable-name sr_io_throughput_total_xxxxxxxx) takes priority.
    """

    def __init__(self, *args):
        """Create the monitor; all arguments are forwarded to ObjectMonitor."""
        self.monitortype = "Host"
        # Variables created from perfmon config found on SRs (not on the host)
        self.secondary_variables = set()
        self.secondary_xmlconfigs = {}  # map of sr uuid to xml text
        ObjectMonitor.__init__(self, *args)
        print_debug("Created HOSTMonitor with uuid %s" % self.uuid)

    def get_default_variable_config(self, variable_name, config_tag):
        """Return the default value (a string) of config_tag for variable_name.

        This allows user to not specify full set of tags for each variable
        in xml config.

        Raises XmlConfigException when the tag has no default for this
        variable (or the tag is unknown, which includes alarm_trigger_level).
        """
        if config_tag == "consolidation_fn":
            if variable_name == "cpu_usage":
                return "average"
            else:
                return "sum"
        elif config_tag == "rrd_regex":
            if variable_name == "cpu_usage":
                return "cpu[0-9]+"
            elif variable_name == "network_usage":
                return "pif_eth[0-9]+_[rt]x"
            elif variable_name == "memory_free_kib":
                return variable_name
            elif re.match("sr_io_throughput_total_[0-9a-f]{8}$", variable_name):
                # Drop the leading "sr_": the regex is the rest of the name
                return variable_name[3:]
            else:
                raise XmlConfigException(
                    "variable %s: no default rrd_regex - please specify one"
                    % variable_name
                )
        elif config_tag == "alarm_trigger_period":
            return "60"  # 1 minute
        elif config_tag == "alarm_auto_inhibit_period":
            return "3600"  # 1 hour
        elif config_tag == "alarm_trigger_sense":
            if variable_name == "memory_free_kib":
                return "low"
            else:
                return "high"  # trigger if *above* level
        elif config_tag == "alarm_priority":
            return "3"  # Service degradation level defined in PR-1455
        else:
            raise XmlConfigException(
                "variable %s: no default available for tag %s"
                % (variable_name, config_tag)
            )

    def get_active_variables(self):
        """Return the host's own variables plus the active secondary variables."""
        r = self.variables + [v for v in self.secondary_variables if v.active]
        print_debug(
            "Returning active variables: %d main, %d total"
            % (len(self.variables), len(r))
        )
        return r

    def refresh_config(self):
        """Re-read the host's own config, then the config found on its SRs.

        Secondary (SR-provided) config is only re-parsed when the set of SRs
        with perfmon config changed or one of their xml strings changed.
        Parse errors in the secondary config are logged, not raised.
        """
        # Used below as a truth value: whether the host's own config changed
        main_changed = ObjectMonitor.refresh_config(self)

        # Now handle any extra config from SRs.
        # This functionality makes this file inelegant but means that it is
        # possible to set up an alarm on each host that uses an SR by setting
        # appropriate configuration in the SR's other-config.
        if self.uuid not in sruuids_by_hostuuid:
            print_debug("%s not in sruuids_by_hostuuid" % self.uuid)
            self.secondary_variables.clear()
            self.secondary_xmlconfigs.clear()
            return

        secondary_changed = False
        old_sruuids = set(self.secondary_xmlconfigs)  # create set of keys
        current_sruuids = sruuids_by_hostuuid[self.uuid]  # a set already
        if old_sruuids != current_sruuids:
            print_debug("Changed set of perfmon sruuids for host %s" % self.uuid)
            secondary_changed = True
        else:
            for sruuid in sruuids_by_hostuuid[self.uuid]:
                sr_xmlconfig = all_xmlconfigs[sruuid]
                # As an optimisation, if xml unchanged then do not re-parse.
                # Otherwise we would create Variables which would
                # turn out to be same as existing ones so we would ignore them.
                if (
                    sruuid in self.secondary_xmlconfigs
                    and self.secondary_xmlconfigs[sruuid] == sr_xmlconfig
                ):
                    print_debug("Unchanged sr_xmlconfig for sruuid %s" % sruuid)
                else:
                    print_debug(
                        "Found new/different sr_xmlconfig for sruuid %s" % sruuid
                    )
                    secondary_changed = True
                    break

        if secondary_changed:
            try:
                self.__parse_secondary_xmlconfigs()
            except XmlConfigException as e:
                log_err(
                    "%s %s secondary config error: %s"
                    % (self.monitortype, self.uuid, str(e))
                )
            except ExpatError as e:
                log_err(
                    "%s %s secondary XML parse error: %s"
                    % (self.monitortype, self.uuid, str(e))
                )

        if main_changed or secondary_changed:
            # Calculate which secondary variables are active,
            # i.e. not overridden by ones configured on the host rather than the SR.
            main_names = {v.name for v in self.variables}
            for v in self.secondary_variables:
                v.set_active(v.name not in main_names)

    def __parse_secondary_xmlconfigs(self):
        """Rebuild self.secondary_variables from the perfmon xml of this host's SRs.

        Only <variable> nodes named "sr_io_throughput_total_per_host" are used,
        at most one per SR and only if they carry no rrd_regex.  Each accepted
        node is renamed to "sr_io_throughput_total_<first 8 chars of SR uuid>"
        and tagged with a <configured_on> element before a Variable is built
        from it.  An existing secondary variable is kept (not replaced) when
        variable_configs_differ() reports no difference, so its state survives.

        Exceptions from minidom.parseString() or from constructing a Variable
        propagate to the caller.
        """
        variable_names = (
            set()
        )  # Names of the Variable objects we create based on the xml nodes we find
        self.secondary_xmlconfigs.clear()
        for sruuid in sruuids_by_hostuuid[self.uuid]:
            print_debug("Looking for config on SR uuid %s" % sruuid)
            sr_xmlconfig = all_xmlconfigs[sruuid]
            # Remember the xml text so refresh_config() can detect changes
            self.secondary_xmlconfigs[sruuid] = sr_xmlconfig
            xmldoc = minidom.parseString(sr_xmlconfig)
            variable_nodes = xmldoc.getElementsByTagName("variable")
            found = False
            for vn in variable_nodes:
                try:
                    name_element = vn.getElementsByTagName("name")[0]
                    name = name_element.getAttribute("value")
                except IndexError:
                    log_err(
                        "variable missing 'name' tag in perfmon xml config of SR %s"
                        % sruuid
                    )
                    continue  # perhaps other nodes are valid
                print_debug(
                    "Found variable with name %s on SR uuid %s" % (name, sruuid)
                )
                if name != "sr_io_throughput_total_per_host":
                    continue  # Do nothing unless the variable is meant for the host
                if len(vn.getElementsByTagName("rrd_regex")) > 0:
                    log_err(
                        "Configuration error: "
                        "rrd_regex must not be specified in config on SR meant for each host"
                    )
                    continue  # perhaps another node is valid
                if found:
                    log_err(
                        "Configuration error: duplicate variable %s on SR %s"
                        % (name, sruuid)
                    )
                    # A host can only have one Variable from a given SR
                    # since we only accept one kind (one name).
                    break
                found = True
                # Rewrite the node in place so the Variable gets a per-SR name
                name_override = "sr_io_throughput_total_%s" % sruuid[0:8]
                name_element.setAttribute("value", name_override)
                provenance_element = xmldoc.createElement("configured_on")
                provenance_element.setAttribute("class", "SR")
                provenance_element.setAttribute("uuid", sruuid)
                vn.appendChild(provenance_element)
                var = Variable(vn, self.alarm_create, self.get_default_variable_config)
                variable_names.add(var.name)
                append_var = True
                vars_with_same_name = [
                    v for v in self.secondary_variables if v.name == var.name
                ]
                for v in vars_with_same_name:
                    # this list should be 0 or 1 long!
                    # only replace variable in self.secondary_variables if its config has changed.
                    # This way we don't reset its state
                    if variable_configs_differ(var, v):
                        print_debug(
                            "Removing existing secondary variable to replace with new: %s"
                            % v.name
                        )
                        self.secondary_variables.remove(v)
                    else:
                        print_debug(
                            "Found existing secondary variable with same config: %s"
                            % v.name
                        )
                        append_var = False
                if append_var:
                    print_debug(
                        "Adding %s to set of secondary variables for host UUID=%s"
                        % (var.name, self.uuid)
                    )
                    self.secondary_variables.add(var)

        # Now that we have read all the xml items,
        # delete any old variables that do not appear in the new variable_nodes
        print_debug(
            "Going to delete any secondary_variables not in %s" % variable_names
        )
        variables_to_remove = [
            v for v in self.secondary_variables if v.name not in variable_names
        ]
        for v in variables_to_remove:
            print_debug(
                "Deleting %s from set of secondary variables for UUID=%s"
                % (v.name, self.uuid)
            )
            self.secondary_variables.remove(v)

# Maps any uuid (SR, host or VM) to the xml config string found in its
# other-config:perfmon key.  Rebuilt in place by update_all_xmlconfigs().
all_xmlconfigs = {}

# Maps host uuid to a set of the uuids of the host's SRs
# that have other-config:perfmon
sruuids_by_hostuuid = {}


def update_all_xmlconfigs(session):
    """Rebuild the module-level perfmon config caches from XAPI.

    all_xmlconfigs ends up mapping any uuid (SR, host or VM) to the xml
    config string held in that object's other-config:perfmon key.
    sruuids_by_hostuuid ends up mapping a host uuid to the set of uuids of
    SRs that carry such a key and have a PBD on that host; together the two
    allow lookup of the perfmon xml of the SRs connected to a host.

    Both dictionaries are emptied and refilled in place (never rebound).
    """
    host_records = session.xenapi.host.get_all_records()
    vm_records = session.xenapi.VM.get_all_records()
    sr_records = session.xenapi.SR.get_all_records()

    # uuid -> other_config, for every host, VM and SR
    other_config_by_uuid = {}
    for records in (host_records, vm_records, sr_records):
        other_config_by_uuid.update(
            {record["uuid"]: record["other_config"] for record in records.values()}
        )

    # uuid -> perfmon xml, for the objects that have one
    all_xmlconfigs.clear()
    all_xmlconfigs.update(
        {
            uuid: other_config["perfmon"]
            for uuid, other_config in other_config_by_uuid.items()
            if "perfmon" in other_config
        }
    )

    # host uuid -> uuids of its SRs that have perfmon config
    sruuids_by_hostuuid.clear()
    for sr_record in sr_records.values():
        if "perfmon" not in sr_record["other_config"]:
            continue
        sruuid = sr_record["uuid"]
        # If we hadn't done SR.get_all_records we would now do SR.get_PBDs.
        pbd_host_refs = [
            session.xenapi.PBD.get_host(pbd) for pbd in sr_record["PBDs"]
        ]
        pbd_host_uuids = [host_records[ref]["uuid"] for ref in pbd_host_refs]
        for host_uuid in pbd_host_uuids:
            sruuids_by_hostuuid.setdefault(host_uuid, set()).add(sruuid)


# Seconds between reads of rrd_updates (default: 5 minutes)
interval = 5 * 60
# Upper bound of the random extra sleep per loop, as a percentage of interval
interval_percent_dither = 5
# Seconds between samples provided by rrd_updates
rrd_step = 60
# When true, print_debug() writes its messages to stderr
debug = False

# Seconds between calls to update_all_xmlconfigs() (default: 30 minutes)
config_update_period = 30 * 60

# Name of the af_unix command socket
# (the "\0" stops socket.bind() creating a fs node)
cmdsockname = "\0perfmon"
# Maximum number of bytes read from the command socket per command
cmdmaxlen = 256

# pylint: disable=global-statement
def main(): # pragma: no cover
    """Parse the command line, then run the monitoring loop.

    Each loop iteration refreshes rrd_updates, periodically refreshes the
    cached perfmon xml configs, keeps the sets of VM/SR monitors and the
    host monitor in step with the objects listed in rrd_updates, lets each
    monitor process the updates (which may generate alarms), and then sleeps
    for interval plus a random dither.  A datagram on the command socket
    ends the sleep early ("refresh" forces a config re-read on the next
    loop, "debug_mem" calls debug_mem()).

    Returns 0 once --numloops iterations have run; without that option the
    loop never ends.

    Raises:
        UsageException: on an unknown option, a non-integer option value or
            an rrdstep other than 5 or 60.
        PerfMonException: if the host uuid reported by rrd_updates changes.
        urllib.error.HTTPError: for HTTP errors other than 401 and 500.
    """
    global interval
    global interval_percent_dither
    global rrd_step
    global debug
    global config_update_period
    maxruns = None
    try:
        argv = sys.argv[1:]
        opts, _ = getopt.getopt(
            argv,
            "i:n:ds:c:D:",
            [
                "interval=",
                "numloops=",
                "debug",
                "rrdstep=",
                "config_update_period=",
                "interval_percent_dither=",
            ],
        )
    except getopt.GetoptError as e:
        raise UsageException from e

    try:
        for opt, arg in opts:
            if opt in ("-i", "--interval"):
                interval = int(arg)
            elif opt in ("-n", "--numloops"):
                maxruns = int(arg)
            elif opt in ("-d", "--debug"):
                debug = True
            elif opt in ("-s", "--rrdstep"):
                rrd_step = int(arg)
                if rrd_step not in (5, 60):
                    raise UsageException
            elif opt in ("-c", "--config_update_period"):
                config_update_period = int(arg)
            elif opt in ("-D", "--interval_percent_dither"):
                interval_percent_dither = int(arg)
            else:
                raise UsageException
    except ValueError as e:
        # int() rejected an option value: that is a usage error, not a crash
        raise UsageException from e

    # open the cmd socket (over which we listen for commands such as "refresh")
    cmdsock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
    cmdsock.bind(cmdsockname)

    # The dither on each loop (prevents stampede on master)
    rand = random.Random().uniform
    dither = (interval * interval_percent_dither) / 100.0

    # Create a XAPI session on first run
    restart_session = True

    # Create a client for getting the rrd_updates over HTTP
    rrd_updates = RRDUpdates()

    # Work out when next to update all the xmlconfigs for all the
    # hosts and all the VMs.  This causes a lot of data to be retrieved
    # from the master, so we only do it once every config_update_period
    # and we cache the results
    next_config_update = time.time()

    # monitors for vms running on this host.
    # This dictionary uses uuids to lookup each monitor object
    vm_mon_lookup = {}

    # monitors for srs plugged on this host
    # This dictionary uses uuids to lookup each monitor object
    sr_mon_lookup = {}

    # The monitor for the host
    host_mon = None

    runs = 0
    while True:
        print_debug("Run: %d" % runs)

        # Get new updates - and catch any http errors
        try:
            # if session has failed on last run we need to restart it
            if restart_session:
                session = XapiSession()
                restart_session = False

            rrd_updates.refresh(session)

            # Should we update all_xmlconfigs
            if time.time() >= next_config_update:
                print_debug("Updating all_xmlconfigs")
                # yes - update all the xml configs:
                # this generates a few LARGE xapi messages from the master
                update_all_xmlconfigs(session)

                # Set time when to do this next
                next_config_update = time.time() + config_update_period

            # List of VMs present in rrd_updates
            vm_uuid_list = rrd_updates.get_uuid_list_by_objtype("vm")

            # Remove any monitors for VMs no longer listed in rrd_updates page
            # We use .pop() inside the loop, use list(dict_var.keys()):
            for uuid in list(vm_mon_lookup.keys()):
                if uuid not in vm_uuid_list:
                    vm_mon_lookup.pop(uuid)

            # Create monitors for VMs that have just appeared in rrd_updates page
            for uuid in vm_uuid_list:
                if uuid not in vm_mon_lookup:
                    vm_mon_lookup[uuid] = VMMonitor(uuid)
                else:
                    # check if the config has changed, e.g. by XenCenter
                    vm_mon_lookup[uuid].refresh_config()

            # Remove monitor for the host if it's no longer listed in rrd_updates page
            # Create monitor for the host if it has just appeared in rrd_updates page
            try:
                host_uuid = rrd_updates.get_uuid_list_by_objtype("host")[
                    0
                ]  # should only ever be one of these
            except Exception:
                # list may be empty!
                host_uuid = None

            if not host_uuid:
                host_mon = None
            elif not host_mon:
                host_mon = HOSTMonitor(host_uuid)
            elif host_mon.uuid != host_uuid:
                raise PerfMonException(
                    "host uuid in rrd_updates changed (old: %s, new %s)"
                    % (host_mon.uuid, host_uuid)
                )
            else:
                # check if the config has changed, e.g. by XenCenter
                host_mon.refresh_config()

            # List of SRs present in rrd_updates
            sr_uuid_list = rrd_updates.get_uuid_list_by_objtype("sr")
            print_debug("sr_uuid_list = %s" % sr_uuid_list)

            # Remove monitors for SRs no longer listed in the rrd_updates page
            # We use .pop() inside the loop, use list(dict_var.keys()):
            for uuid in list(sr_mon_lookup.keys()):
                if uuid not in sr_uuid_list:
                    sr_mon_lookup.pop(uuid)
            # Create monitors for SRs that have just appeared in rrd_updates page
            for uuid in sr_uuid_list:
                if uuid not in sr_mon_lookup:
                    sr_mon_lookup[uuid] = SRMonitor(uuid)
                else:
                    sr_mon_lookup[uuid].refresh_config()

            # Go through each vm_mon and update it using the rrd_udpates
            # this may generate alarms
            for vm_mon in vm_mon_lookup.values():
                vm_mon.process_rrd_updates(rrd_updates, session)

            # Ditto for the host_mon
            if host_mon:
                host_mon.process_rrd_updates(rrd_updates, session)

            # And for the sr_mons
            for sr_mon in sr_mon_lookup.values():
                sr_mon.process_rrd_updates(rrd_updates, session)

        # NB: the order of these handlers matters, since both
        # ConnectionRefusedError and HTTPError are subclasses of OSError.
        except ConnectionRefusedError as e:
            # "Connection refused[111]"
            # this happens when we try to restart session and *that* fails
            time.sleep(2)
            log_err(
                "caught connection refused error: (%s) - restarting XAPI session"
                % str(e)
            )
            restart_session = True
        except urllib.error.HTTPError as e:
            if e.code in (401, 500):
                # Error getting rrd_updates: 401=Unauthorised, 500=Internal
                # start new session
                log_err("caught http.error: (%s) - restarting XAPI session" % str(e))
                restart_session = True
            else:
                # Don't know why we got this error - crash, die and look at logs later
                raise
        except OSError as e:
            # This happens if we send messages or
            # read other-config:perfmon after xapi is restarted
            log_err("caught connection error: (%s) - restarting XAPI session" % str(e))
            restart_session = True

        runs += 1
        if maxruns is not None and runs >= maxruns:
            break

        # Force collection of cyclically referenced objects cos we don't
        # trust GC to do it on its own
        gc.collect()

        # Sleep for interval + dither, exiting early if we recv a cmd
        timeout = rand(interval, interval + dither)
        cmdsock.settimeout(timeout)
        try:
            # errors="replace": a datagram that is not valid UTF-8 must end up
            # as an unhandled command below, not as a UnicodeDecodeError that
            # kills the daemon.
            cmd = cmdsock.recv(cmdmaxlen).decode(errors="replace")
        except socket.timeout:
            pass
        else:
            if cmd == "refresh":
                # This forces a re-read of all the configs on the next loop
                next_config_update = time.time()
            elif cmd == "debug_mem":
                debug_mem()
            else:
                log_err("received unhandled command %s" % cmd)

        # continue to next run

    return 0


def sigterm_handler(sig, _): # pragma: no cover
    """Signal handler: log the signal number, then exit with status 1.

    The second argument (the interrupted stack frame) is ignored.
    """
    notice = "Caught signal %d - exiting" % sig
    log_err(notice)
    # Same effect as sys.exit(1)
    raise SystemExit(1)


# File recording the pid of the running perfmon instance
pidfile = "/var/run/perfmon.pid"

if __name__ == "__main__": # pragma: no cover

    # setup signal handler to print out notice when killed
    signal.signal(signal.SIGTERM, sigterm_handler)

    if "--daemon" in sys.argv[1:]:
        # Remove the flag so that main()'s getopt parsing never sees it
        sys.argv.remove("--daemon")
        if os.fork() != 0:
            sys.exit(0)
        os.setsid()
        # For /dev/null, `encoding` and `with` is not needed
        # pylint: disable=unspecified-encoding, consider-using-with
        sys.stdout = open("/dev/null", "w")
        sys.stdin = open("/dev/null", "r")
        sys.stderr = sys.stdout

    # Exit if perfmon already running
    try:
        with open(pidfile, encoding="utf-8") as file:
            pid = file.read().strip()
    except FileNotFoundError:
        pid = ""

    # Only a numeric pid can name a process.  Without this check an empty
    # pidfile would test "/proc/", which always exists, and perfmon would
    # refuse to start.
    if pid.isdigit() and os.path.exists("/proc/%s" % pid):
        log_err("perfmon already running - exiting")
        sys.exit(3)

    try:
        # Write out pidfile
        with open(pidfile, "w", encoding="utf-8") as fd:
            fd.write("%d" % os.getpid())

        # run the main loop
        rc = main()

    except UsageException:
        # Print the usage
        log_err(
            "usage: %s [-i <interval> -n <loops> -d -s <rrd_step> -c"
            " <config_update_period> -D <interval_percent_dither>] \\\n"
            "\t[--interval=<interval> --numloops=<loops> --debug \\\n"
            "\t --rrdstep=<rrd_step> --daemon \\\n"
            "\t --config_update_period=<config_update_period> \\\n"
            "\t --interval_percent_dither=<interval_percent_dither>]\n"
            "  interval:\tseconds between reads of http://localhost/rrd_updates?...\n"
            "  loops:\tnumber of times to run before exiting\n"
            "  rrd_step:\tseconds between samples provided by rrd_updates."
            "  Valid values are 5 or 60\n"
            "  config_update_period:\tseconds between getting updates"
            " of all VM/host records from master\n"
            "  interval_percent_dither:\tmax percent dither in each loop"
            " - prevents stampede on master\n"
            % (sys.argv[0])
        )
        rc = 1

    except SystemExit as exit_exc:
        # we caught a signal which we have already logged;
        # keep its exit status so that sys.exit(rc) below has a value to use
        rc = exit_exc.code

    except Exception as exp:
        rc = 2
        log_err("FATAL ERROR: perfmon will exit")
        log_err("Exception is of class %s" % exp.__class__)
        ex = sys.exc_info()
        err = traceback.format_exception(*ex)

        # XenAPI.Failure has `details`.
        try:
            # print the exception args nicely
            log_err(str(exp))
        except Exception:
            try:
                err_msg = "\n".join([str(x) for x in exp.details])
                # print the exception args nicely
                log_err(err_msg)
            except Exception:
                pass

        # now log the traceback to syslog
        for exline in err:
            log_err(exline)

    # remove pidfile and exit
    try:
        os.unlink(pidfile)
    except FileNotFoundError:
        # The pidfile was never written (e.g. opening it failed) or is
        # already gone; either way there is nothing left to remove.
        pass
    sys.exit(rc)
