File: //lib/python3/dist-packages/sos/cleaner/mappings/hostname_map.py
# Copyright 2020 Red Hat, Inc. Jake Hunsaker <jhunsake@redhat.com>
# This file is part of the sos project: https://github.com/sosreport/sos
#
# This copyrighted material is made available to anyone wishing to use,
# modify, copy, or redistribute it subject to the terms and conditions of
# version 2 of the GNU General Public License.
#
# See the LICENSE file in the source distribution for further information.
import re
from sos.cleaner.mappings import SoSMap
class SoSHostnameMap(SoSMap):
    """Mapping store for hostnames and domain names
    Hostnames are obfuscated using an incrementing counter based on the total
    number of hosts matched regardless of domain name.
    Domain names are obfuscated based on the host's hostname, plus any user
    defined domains passed in by the `--domains` options.
    Domains are obfuscated as whole units, meaning the domains 'example.com'
    and 'host.foo.example.com' will be separately obfuscated with no relation
    for example as 'obfuscatedomdain1.com' and 'obfuscatedomain2.com'.
    Top-level domains are left untouched.
    """
    ignore_matches = [
        'localhost',
        '.*localdomain.*',
        '^com..*'
    ]
    skip_keys = [
        'www',
        'api'
    ]
    strip_exts = ('.yaml', '.yml', '.crt', '.key', '.pem', '.log', '.repo',
                  '.rules', '.conf', '.cfg')
    ignore_short_items = True
    match_full_words_only = True
    host_count = 0
    domain_count = 0
    _domains = {}
    hosts = {}
    def load_domains_from_map(self):
        """Because we use 'intermediary' dicts for host names and domain names
        in this parser, we need to re-inject entries from the map_file into
        these dicts and not just the underlying 'dataset' dict
        """
        for domain, ob_pair in self.dataset.items():
            if len(domain.split('.')) == 1:
                self.hosts[domain.split('.')[0]] = self.dataset[domain]
            else:
                if ob_pair.startswith('obfuscateddomain'):
                    # directly exact domain matches
                    self._domains[domain] = ob_pair.split('.')[0]
                    continue
                # strip the host name and trailing top-level domain so that
                # we in inject the domain properly for later string matching
                # note: this is artificially complex due to our stance on
                # preserving TLDs. If in the future the project decides to
                # obfuscate TLDs as well somehow, then this will all become
                # much simpler
                _domain_to_inject = '.'.join(domain.split('.')[1:-1])
                if not _domain_to_inject:
                    continue
                for existing_domain, value in self.dataset.items():
                    _existing = '.'.join(existing_domain.split('.')[:-1])
                    if _existing == _domain_to_inject:
                        _ob_domain = '.'.join(value.split('.')[:-1])
                        self._domains[_domain_to_inject] = _ob_domain
        self.set_initial_counts()
    def get_regex_result(self, item):
        """Override the base get_regex_result() to provide a regex that, if
        this is an FQDN or a straight domain, will include an underscore
        formatted regex as well.
        """
        if '.' in item:
            item = item.replace('.', '(\\.|_)')
        return super().get_regex_result(item)
    def set_initial_counts(self):
        """Set the initial counter for host and domain obfuscation numbers
        based on what is already present in the mapping.
        """
        # hostnames/short names
        try:
            h = sorted(self.hosts.values(), reverse=True)[0].split('host')[1]
            self.host_count = int(h) + 1
        except IndexError:
            # no hosts loaded yet
            pass
        # domain names
        try:
            d = sorted(self._domains.values(), reverse=True)[0].split('domain')
            self.domain_count = int(d[1].split('.')[0]) + 1
        except IndexError:
            # no domains loaded yet
            pass
    def domain_name_in_loaded_domains(self, domain):
        """Check if a potential domain is in one of the domains we've loaded
        and should be obfuscated
        """
        if domain in self._domains:
            return True
        host = domain.split('.')
        no_tld = '.'.join(domain.split('.')[0:-1])
        if len(host) == 1:
            # don't block on host's shortname
            return host[0] in self.hosts
        if any(no_tld.endswith(_d) for _d in self._domains):
            return True
        return False
    def get(self, item):
        # pylint: disable=too-many-branches
        prefix = ''
        suffix = ''
        final = None
        # The regex pattern match may include a leading and/or trailing '_'
        # character due to the need to use word boundary matching, so we need
        # to strip these from the string during processing, but still keep them
        # in the returned string to not mangle the string replacement in the
        # context of the file or filename
        while item.startswith(('.', '_')):
            prefix += item[0]
            item = item[1:]
        while item.endswith(('.', '_')):
            suffix += item[-1]
            item = item[0:-1]
        if item in self.dataset:
            return self.dataset[item]
        if not self.domain_name_in_loaded_domains(item.lower()):
            # no match => return the original string with optional
            # leading/trailing '.' or '_' characters
            return ''.join([prefix, item, suffix])
        if item.endswith(self.strip_exts):
            ext = '.' + item.split('.')[-1]
            item = item.replace(ext, '')
            suffix += ext
        if item not in self.dataset:
            # try to account for use of '-' in names that include hostnames
            # and don't create new mappings for each of these
            for _existing in sorted(self.dataset.keys(), reverse=True,
                                    key=len):
                _host_substr = False
                _test = item.split(_existing)
                _h = _existing.split('.')
                # avoid considering a full FQDN match as a new match off of
                # the hostname of an existing match
                if _h[0] and _h[0] in self.hosts:
                    _host_substr = True
                if len(_test) == 1 or not _test[0]:
                    # does not match existing obfuscation
                    continue
                if not _host_substr and (_test[0].endswith('.') or
                                         item.endswith(_existing)):
                    # new hostname in known domain
                    final = super().get(item)
                    break
                if item.split(_test[0]):
                    # string that includes existing FQDN obfuscation substring
                    # so, only obfuscate the FQDN part
                    try:
                        itm = item.split(_test[0])[1]
                        final = _test[0] + super().get(itm)
                        break
                    except Exception:
                        # fallback to still obfuscating the entire item
                        pass
        if not final:
            final = super().get(item)
        return prefix + final + suffix
    def sanitize_item(self, item):
        host = item.split('.')
        if len(host) == 1:
            # we have a shortname for a host
            return self.sanitize_short_name(host[0].lower())
        if len(host) == 2:
            # we have just a domain name, e.g. example.com
            dname = self.sanitize_domain(host)
            if all(h.isupper() for h in host):
                dname = dname.upper()
            return dname
        if len(host) > 2:
            # we have an FQDN, e.g. foo.example.com
            hostname = host[0]
            domain = host[1:]
            # obfuscate the short name
            if len(hostname) > 2:
                ob_hostname = self.sanitize_short_name(hostname.lower())
            else:
                # by best practice it appears the host part of the fqdn was cut
                # off due to some form of truncating, as such don't obfuscate
                # short strings that are likely to throw off obfuscation of
                # unrelated bits and paths
                ob_hostname = 'unknown'
            ob_domain = self.sanitize_domain(domain)
            self.dataset[item] = ob_domain
            _fqdn = '.'.join([ob_hostname, ob_domain])
            if all(h.isupper() for h in host):
                _fqdn = _fqdn.upper()
            return _fqdn
        return None
    def sanitize_short_name(self, hostname):
        """Obfuscate the short name of the host with an incremented counter
        based on the total number of obfuscated host names
        """
        if not hostname or hostname in self.skip_keys:
            return hostname
        if hostname not in self.dataset:
            ob_host = f"host{self.host_count}"
            self.hosts[hostname] = ob_host
            self.host_count += 1
            self.dataset[hostname] = ob_host
            self.add_regex_item(hostname)
        return self.dataset[hostname]
    def sanitize_domain(self, domain):
        """Obfuscate the domainname, broken out into subdomains. Top-level
        domains are ignored.
        """
        for _skip in self.ignore_matches:
            # don't obfuscate vendor domains
            if re.match(_skip, '.'.join(domain)):
                return '.'.join(domain)
        top_domain = domain[-1].lower()
        dname = '.'.join(domain[0:-1]).lower()
        ob_domain = self._new_obfuscated_domain(dname)
        ob_domain = '.'.join([ob_domain, top_domain])
        self.dataset['.'.join(domain)] = ob_domain
        return ob_domain
    def _new_obfuscated_domain(self, dname):
        """Generate an obfuscated domain for each subdomain name given
        """
        if dname not in self._domains:
            self._domains[dname] = f"obfuscateddomain{self.domain_count}"
            self.domain_count += 1
        return self._domains[dname]