streamlogs2fluentd/index.py

#!/usr/bin/env python
import base64
import requests
import gzip
import json
import msgpack
import struct
import os
import shutil
import re
import logging
import time
import io
import urllib
import datetime
import boto3
import botocore

__author__ = "Stefan Reimer"
__author_email__ = "stefan@zero-downtime.net"
__version__ = "0.9.9"

# IAM Alias lookup cache
account_aliases = {}

# ENI lookup cache
enis = {}

# IP lookup cache
ips = {}

logger = logging.getLogger(__name__)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger('boto3').setLevel(logging.WARNING)
logging.getLogger('botocore').setLevel(logging.WARNING)


def boolean(value):
    if value in ('t', 'T', 'true', 'True', 'TRUE', '1', 1, True):
        return True
    return False


def decrypt(encrypted):
    try:
        kms = boto3.client('kms')
        plaintext = kms.decrypt(CiphertextBlob=base64.b64decode(encrypted))['Plaintext']
        return plaintext.decode()
    except Exception:
        logging.exception("Failed to decrypt via KMS")


CHUNK_SIZE = 128
DEBUG = boolean(os.getenv('DEBUG', default=False))
TEST = boolean(os.getenv('TEST', default=False))
RESOLVE_ACCOUNT = boolean(os.getenv('RESOLVE_ACCOUNT', default=True))
ENHANCE_FLOWLOG = boolean(os.getenv('ENHANCE_FLOWLOG', default=True))

if DEBUG:
    logging.getLogger().setLevel(logging.DEBUG)
else:
    logging.getLogger().setLevel(logging.INFO)


# From fluent/fluent-logger-python
class EventTime(msgpack.ExtType):
    def __new__(cls, timestamp):
        seconds = int(timestamp)
        nanoseconds = int(timestamp % 1 * 10 ** 9)
        return super(EventTime, cls).__new__(
            cls,
            code=0,
            data=struct.pack(">II", seconds, nanoseconds),
        )


def fluentd_time(timestamp):
    if isinstance(timestamp, float):
        return EventTime(timestamp)
    else:
        return int(timestamp)


def get_source(region, account_id):
    """ returns a new base source object
        resolves aws account_id to account alias and caches for lifetime of lambda function
    """
    global RESOLVE_ACCOUNT
    source = {'account': account_id, 'region': region}
    if RESOLVE_ACCOUNT and not TEST:
        try:
            if account_id not in account_aliases:
                boto3_config = botocore.config.Config(retries=dict(max_attempts=2), connect_timeout=3, read_timeout=5)
                iam = boto3.client('iam', config=boto3_config)
                account_aliases[account_id] = iam.list_account_aliases()['AccountAliases'][0]

            source['account_alias'] = account_aliases[account_id]

        except(botocore.exceptions.ConnectTimeoutError, KeyError, IndexError):
            logger.warning("Could not resolve IAM account alias, disabled for this session")
            RESOLVE_ACCOUNT = False
            pass

    return source


def add_flow_metadata(flow):
    """ adds metadata to VPC flow: ENI, direction, type
        caches the ENI and IP lookup tables for Lambda lifetime
    """
    global ENHANCE_FLOWLOG
    if ENHANCE_FLOWLOG and not TEST:
        try:
            # Check cache and update if missed with all ENIs in one go
            if flow['interface-id'] not in enis:
                boto3_config = botocore.config.Config(retries=dict(max_attempts=2), connect_timeout=3, read_timeout=5)
                ec2 = boto3.client('ec2', config=boto3_config)
                interface_iter = ec2.get_paginator('describe_network_interfaces').paginate()
                for response in interface_iter:
                    for interface in response['NetworkInterfaces']:
                        # Lookup table by ENI ID
                        enis[interface['NetworkInterfaceId']] = interface

                        # Lookup table by IP to classify traffic
                        ips[interface['PrivateIpAddress']] = interface
        except(botocore.exceptions.ConnectTimeoutError, KeyError, IndexError):
            logger.warning("Error trying to get metadata for ENIs, disabling ENHANCE_FLOWLOG")
            ENHANCE_FLOWLOG = False
            return flow

        try:
            eni = enis[flow['interface-id']]
            metadata = {'eni.az': eni['AvailabilityZone'],
                        'eni.subnet': eni['SubnetId']}
            remote_ip = None
            if len(eni['Groups']):
                metadata['eni.sg'] = eni['Groups'][0]['GroupName']

            # Add PublicIP if attached
            if 'Association' in eni and 'PublicIp' in eni['Association']:
                metadata['eni.public_ip'] = eni['Association']['PublicIp']

            # Determine traffic direction
            if eni['PrivateIpAddress'] == flow['srcaddr']:
                metadata['direction'] = 'Out'
                remote_ip = flow['dstaddr']
            elif eni['PrivateIpAddress'] == flow['dstaddr']:
                metadata['direction'] = 'In'
                remote_ip = flow['srcaddr']

            # Try to classify traffic:
            # Free,Regional,Out
            if remote_ip:
                if remote_ip in ips:
                    if ips[remote_ip]['AvailabilityZone'] == eni['AvailabilityZone'] and ips[remote_ip]['VpcId'] == eni['VpcId']:
                        metadata['traffic_class'] = 'Free'
                    else:
                        metadata['traffic_class'] = 'Regional'
                else:
                    # Incoming traffic is free 90% of times
                    if metadata['direction'] == 'In':
                        metadata['traffic_class'] = 'Free'
                    else:
                        metadata['traffic_class'] = 'Out'

            flow.update(metadata)

        except(KeyError, IndexError) as e:
            logger.warning("Could not get additional data for ENI {} ({})".format(flow['interface-id'], e))
            pass

    return flow


class Queue:
    url = urllib.parse.urlsplit(os.getenv('FLUENTD_URL', default=''), scheme='https')
    passwd = os.getenv('FLUENT_SHARED_KEY', default=None)

    verify_certs = os.getenv('FLUENTD_VERIFY_CERTS', default=1)
    if verify_certs in ('f', 'F', 'false', 'False', 'FALSE', '0', 0, False):
        verify_certs = False
    else:
        verify_certs = True

    # cached request session
    request = requests.Session()
    request.headers = {"Content-type": "application/msgpack"}
    if passwd:
        request.auth = ("fluent", passwd)

    def __init__(self, tag):
        self._queue = []
        self.tag = tag
        self.sent = 0

    def send(self, event):
        self._queue.append(event)
        logger.debug("Queued {} event: {}".format(self.tag, event))
        # Send events in chunks
        if len(self._queue) >= CHUNK_SIZE:
            self.flush()

    def flush(self):
        events = len(self._queue)
        if not events:
            return

        logger.debug("Sending {} events to {}/{} ({})".format(events, self.url.geturl(), self.tag, self.request))

        if not TEST:
            # Send events via POSTs reusing the same https connection, retry couple of times
            retries = 0
            _url = '{}/{}'.format(self.url.geturl(), self.tag)
            while True:
                try:
                    r = self.request.post(url=_url, data=msgpack.packb(self._queue), verify=self.verify_certs, timeout=(6, 30))
                    if r:
                        break
                    else:
                        logger.warning("HTTP Error: {}".format(r.status_code))

                except requests.RequestException as e:
                    logger.warning("RequestException: {}".format(e))
                    pass

                if retries >= 2:
                    raise Exception("Error sending {} events to {}. Giving up.".format(events, _url))

                retries = retries + 1
                time.sleep(1)
        else:
            logger.debug("Test mode, dump only: {}".format(msgpack.packb(self._queue)))

        self.sent = self.sent + events
        self._queue = []

    def info(self):
        logger.info("Sent {} events to {}/{} ({})".format(self.sent, self.url.geturl(), self.tag, self.request))


# Handler to handle CloudWatch logs.
def handler(event, context):
    logger.debug("Event received: {}".format(event))

    (region, account_id) = context.invoked_function_arn.split(":")[3:5]

    # Cloudwatch Logs event
    if 'awslogs' in event:
        # Grab the base64-encoded data.
        b64strg = event['awslogs']['data']

        # Decode base64-encoded string, which should be a gzipped object.
        zippedContent = io.BytesIO(base64.b64decode(b64strg))

        # Decompress the content and load JSON.
        with gzip.GzipFile(mode='rb', fileobj=zippedContent) as content:
            for line in content:
                awsLogsData = json.loads(line.decode())

        # First determine type
        if re.match("/aws/lambda/", awsLogsData['logGroup']):
            logs = Queue("aws.lambda")
        elif re.search("cloudtrail", awsLogsData['logGroup'], flags=re.IGNORECASE):
            logs = Queue("aws.cloudtrail")
        elif re.match("RDSOSMetrics", awsLogsData['logGroup']):
            logs = Queue("aws.rdsosmetrics")
        elif re.match("vpcflowlog", awsLogsData['logGroup'], flags=re.IGNORECASE):
            logs = Queue("aws.vpcflowlog")
        else:
            logs = Queue("aws.cloudwatch_logs")

        # Build list of log events
        for e in awsLogsData['logEvents']:
            event = {}
            source = get_source(region, account_id)
            parsed = {}

            # Remove whitespace / empty events & skip over empty events
            e['message'] = e['message'].strip()
            if re.match(r'^\s*$', e['message']):
                continue

            # inject existing data from subscrition filters
            if('extractedFields' in e.keys()):
                for key in e['extractedFields']:
                    event[key] = e['extractedFields'][key]

            # lambda ?
            if logs.tag == 'aws.lambda':
                # First look for the three AWS Lambda entries
                mg = re.match(r'(?P<type>(START|END|REPORT)) RequestId: (?P<request>\S*)', e['message'])
                if mg:
                    parsed['RequestId'] = mg.group('request')
                    if mg.group('type') == 'REPORT':
                        pattern = r'.*(?:\tDuration: (?P<duration>[\d\.\d]+) ms\s*)(?:\tBilled Duration: (?P<billed_duration>[\d\.\d]+) ms\s*)(?:\tMemory Size: (?P<memory_size>[\d\.\d]+) MB\s*)(?:\tMax Memory Used: (?P<max_memory_used>[\d\.\d]+) MB)(?:\tInit Duration: (?P<init_duration>[\d\.\d]+) ms\s*)?'

                    elif mg.group('type') == 'START':
                        pattern = r'.*(?:Version: (?P<version>.*))'

                    else:
                        pattern = ''

                    data = re.match(pattern, e['message'])
                    for key in data.groupdict().keys():
                        if data.group(key):
                            parsed[key] = data.group(key)

                    # All other info parsed, so just set type itself
                    event['message'] = mg.group('type')

                else:
                    # Try to extract data from AWS default python logging format
                    # This normalizes print vs. logging entries and allows requestid tracking
                    # "[%(levelname)s]\t%(asctime)s.%(msecs)dZ\t%(aws_request_id)s\t%(message)s\n"
                    _msg = e['message']
                    pattern = r'(?:\[(?P<level>[^\]]*)\]\s)?(?P<time>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}Z)\s(?P<RequestId>\S*?)\s(?P<message>.*)'
                    data = re.match(pattern, e['message'], flags=re.DOTALL)
                    if data:
                        if data.group('level'):
                            event['level'] = data.group('level')
                        event['time'] = fluentd_time(datetime.datetime.strptime(data.group('time'), '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())
                        parsed['RequestId'] = data.group('RequestId')
                        _msg = data.group('message')

                    # try to parse the remaining as json
                    try:
                        _json = json.loads(_msg)
                        # Make sure we have an actual object assigned to json field
                        if isinstance(_json, dict):
                            event['message_json'] = _json
                        else:
                            event['message'] = _json
                    except (ValueError, TypeError, KeyError):
                        event['message'] = _msg

            # cloudtrail ?
            elif logs.tag == 'aws.cloudtrail':
                try:
                    parsed = json.loads(e['message'])

                    # use eventTime and eventID from the event itself
                    event['time'] = fluentd_time(datetime.datetime.strptime(parsed['eventTime'], '%Y-%m-%dT%H:%M:%SZ').timestamp())
                    event['id'] = parsed['eventID']
                    # override region from cloudtrail event
                    source['region'] = parsed['awsRegion']

                except (ValueError, TypeError, KeyError):
                    event['message'] = e['message']
                    parsed.clear()

            # RDS metrics ?
            elif logs.tag == 'aws.rdsosmetrics':
                try:
                    parsed = json.loads(e['message'])

                except (ValueError, TypeError, KeyError):
                    event['message'] = e['message']

            # VPC FlowLog ?
            # <version> <account-id> <interface-id> <srcaddr> <dstaddr> <srcport> <dstport> <protocol> <packets> <bytes> <start> <end> <action> <log-status>
            elif logs.tag == 'aws.vpcflowlog':
                row = e['message'].split(" ")

                # Skip over NODATA,SKIPDATA entries, what would be the point having these in ES ?
                if row[13] != 'OK':
                    continue

                parsed = add_flow_metadata({'interface-id': row[2], 'srcaddr': row[3], 'dstaddr': row[4], 'srcport': row[5], 'dstport': row[6], 'protocol': row[7],
                                            'packets': row[8], 'bytes': row[9], 'start': row[10], 'end': row[11], 'action': row[12], 'log-status': row[13]})

            # Fallback add raw message
            else:
                event['message'] = e['message']

            if parsed and logs.tag:
                event[logs.tag] = parsed

            # Forward cloudwatch logs event ID
            source['log_group'] = awsLogsData['logGroup']
            source['log_stream'] = awsLogsData['logStream']
            event['source'] = source

            # If time and id are not set yet use data from cloudwatch logs event
            if 'time' not in event:
                event['time'] = fluentd_time(e['timestamp'] / 1000)
            if 'id' not in source:
                event['id'] = e['id']

            logs.send(event)

        logs.flush()
        logs.info()

    # S3 Put event
    elif 'Records' in event:
        s3_client = boto3.client('s3')

        bucket = event['Records'][0]['s3']['bucket']['name']
        key = event['Records'][0]['s3']['object']['key']

        file_path = '/tmp/stream2fluentd.gz'
        if TEST:
            shutil.copyfile(key, file_path)
        else:
            s3_client.download_file(bucket, key, file_path)
        source = get_source(region, account_id)
        source['s3_url'] = '{}/{}'.format(bucket, key)

        alb_regex = re.compile(r"(?P<type>[^ ]*) (?P<timestamp>[^ ]*) (?P<elb>[^ ]*) (?P<client_ip>[^ ]*):(?P<client_port>[0-9]*) (?P<target_ip>[^ ]*)[:-](?P<target_port>[0-9]*) (?P<request_processing_time>[-.0-9]*) (?P<target_processing_time>[-.0-9]*) (?P<response_processing_time>[-.0-9]*) (?P<elb_status_code>|[-0-9]*) (?P<target_status_code>-|[-0-9]*) (?P<received_bytes>[-0-9]*) (?P<sent_bytes>[-0-9]*) \"(?P<request_verb>[^ ]*) (?P<request_url>[^\"]*) (?P<request_proto>- |[^ ]*)\" \"(?P<user_agent>[^\"]*)\" (?P<ssl_cipher>[A-Z0-9-]+) (?P<ssl_protocol>[A-Za-z0-9.-]*) (?P<target_group_arn>[^ ]*) \"(?P<trace_id>[^\"]*)\" \"(?P<domain_name>[^\"]*)\" \"(?P<chosen_cert_arn>[^\"]*)\" (?P<matched_rule_priority>[-.0-9]*) (?P<request_creation_time>[^ ]*) \"(?P<actions_executed>[^\"]*)\" \"(?P<redirect_url>[^ ]*)\" \"(?P<error_reason>[^ ]*)\"")

        # try to identify file type by looking at first lines
        with gzip.open(file_path, mode='rt', newline='\n') as data:
            header = data.readlines(2048)

        # ALB Access ?
        if alb_regex.match(header[0]):
            logs = Queue("aws.alb_accesslog")

        # cloudfront access logs
        elif len(header) > 1 and re.match('#Version:', header[0]) and re.match('#Fields:', header[1]):
            logs = Queue("aws.cloudfront_accesslog")

        else:
            logger.warning("{}/{}: Unknown type!".format(bucket, key))
            return

        if logs.tag == 'aws.alb_accesslog':
            with gzip.open(file_path, mode='rt', newline='\n') as data:
                for line in data:
                    event = {}
                    parsed = {}
                    data = alb_regex.match(line)
                    if data:
                        for key in data.groupdict().keys():
                            value = data.group(key)

                            # Remove empty values
                            if value in ['-', '-\n']:
                                continue

                            # Remove times of requests timed out
                            if key in ['request_processing_time', 'target_processing_time', 'response_processing_time'] and value in ['-1']:
                                continue

                            parsed[key] = data.group(key)
                    else:
                        logger.warning("Could not parse ALB access log entry: {}".format(line))
                        continue

                    event['time'] = fluentd_time(datetime.datetime.strptime(parsed['request_creation_time'], '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())

                    # Copy to host to allow geoip upstream
                    event['host'] = parsed['client_ip']
                    event[logs.tag] = parsed
                    event['source'] = source

                    logs.send(event)

        elif logs.tag == 'aws.cloudfront_accesslog':
            with gzip.open(file_path, mode='rt', newline='\n') as data:
                next(data)
                # columns are in second line: first is #Fields, next two are merged into time later
                columns = next(data).split()[3:]

                for line in data:
                    event = {}
                    parsed = {}

                    # Todo hash each line to create source['id']
                    # source['id'] = md5.of.line matching ES ids

                    row = line.split('\t')
                    # cloudfront events are logged to the second only, date and time are seperate
                    event['time'] = fluentd_time(datetime.datetime.strptime(row[0] + " " + row[1], '%Y-%m-%d %H:%M:%S').timestamp())

                    for n, c in enumerate(columns, 2):
                        value = row[n]
                        if value not in ['-', '-\n']:
                            parsed[c] = row[n]
                            # Copy c-ip to host to allow geoip upstream
                            if c == 'c-ip':
                                event['host'] = row[n]

                    event[logs.tag] = parsed
                    event['source'] = source

                    logs.send(event)

        logs.flush()
        logs.info()