Introduce shorter timeouts for AWS API calls to fail fast and prevent cost explosions due to high log volume
continuous-integration/drone/push Build encountered an error Details

This commit is contained in:
Stefan Reimer 2020-01-04 16:11:52 +00:00
parent f0fd0c1e07
commit ce57cf70f4
2 changed files with 20 additions and 13 deletions

View File

@ -1,5 +1,10 @@
# Changelog
## 0.9.9
- Improved error handling / descreased timouts calling AWS APIs to fail fast in case AWS throttles API calls
which causes each Lambda call to timout and run for 300s
- Skip over more invalid vpc flowlog entries
## 0.9.8
- Fix for ALB AccessLog parser to handle spaces in request_url
- Improved VPC FlowLog metadata augmentation

View File

@ -14,10 +14,11 @@ import io
import urllib
import datetime
import boto3
import botocore
__author__ = "Stefan Reimer"
__author_email__ = "stefan@zero-downtime.net"
__version__ = "0.9.8"
__version__ = "0.9.9"
# IAM Alias lookup cache
account_aliases = {}
@ -89,13 +90,14 @@ def get_source(region, account_id):
if RESOLVE_ACCOUNT and not TEST:
try:
if account_id not in account_aliases:
iam = boto3.client('iam')
boto3_config = botocore.config.Config(retries=dict(max_attempts=2), connect_timeout=3, read_timeout=5)
iam = boto3.client('iam', config=boto3_config)
account_aliases[account_id] = iam.list_account_aliases()['AccountAliases'][0]
source['account_alias'] = account_aliases[account_id]
except(KeyError, IndexError):
logger.warning("Could not resolve IAM account alias")
except(botocore.exceptions.ConnectTimeoutError, KeyError, IndexError):
logger.warning("Could not resolve IAM account alias, disabled for this session")
RESOLVE_ACCOUNT = False
pass
@ -111,16 +113,17 @@ def add_flow_metadata(flow):
try:
# Check cache and update if missed with all ENIs in one go
if flow['interface-id'] not in enis:
ec2 = boto3.client('ec2')
boto3_config = botocore.config.Config(retries=dict(max_attempts=2), connect_timeout=3, read_timeout=5)
ec2 = boto3.client('ec2', config=boto3_config)
interface_iter = ec2.get_paginator('describe_network_interfaces').paginate()
for response in interface_iter:
for interface in response['NetworkInterfaces']:
# Lookup table keyed bt ENI ID
# Lookup table by ENI ID
enis[interface['NetworkInterfaceId']] = interface
# Lookup table by IP to classify traffic
ips[interface['PrivateIpAddress']] = interface
except(KeyError, IndexError):
except(botocore.exceptions.ConnectTimeoutError, KeyError, IndexError):
logger.warning("Error trying to get metadata for ENIs, disabling ENHANCE_FLOWLOG")
ENHANCE_FLOWLOG = False
return flow
@ -210,7 +213,7 @@ class Queue:
_url = '{}/{}'.format(self.url.geturl(), self.tag)
while True:
try:
r = self.request.post(url=_url, data=msgpack.packb(self._queue), verify=self.verify_certs)
r = self.request.post(url=_url, data=msgpack.packb(self._queue), verify=self.verify_certs, timeout=(6, 30))
if r:
break
else:
@ -220,12 +223,11 @@ class Queue:
logger.warning("RequestException: {}".format(e))
pass
if retries >= 8:
if retries >= 2:
raise Exception("Error sending {} events to {}. Giving up.".format(events, _url))
retries = retries + 1
logger.warning("Error sending {} events to {}. Retrying in {} seconds.".format(events, _url, retries**2))
time.sleep(retries**2)
time.sleep(1)
else:
logger.debug("Test mode, dump only: {}".format(msgpack.packb(self._queue)))
@ -358,8 +360,8 @@ def handler(event, context):
elif logs.tag == 'aws.vpcflowlog':
row = e['message'].split(" ")
# Skip over NODATA entries, what would be the point having these in ES ?
if row[13] == 'NODATA':
# Skip over NODATA,SKIPDATA entries, what would be the point having these in ES ?
if row[13] != 'OK':
continue
parsed = add_flow_metadata({'interface-id': row[2], 'srcaddr': row[3], 'dstaddr': row[4], 'srcport': row[5], 'dstport': row[6], 'protocol': row[7],