
AWS Lambda loop through files using boto3

I have the following Lambda function that searches my S3 bucket with a prefix equal to the current time in milliseconds. I have about 600-800k files per hour that I would like to do some manipulation on. This code works as intended but takes forever to scan the prefix, and I have a feeling this part of my code is not efficient. Since this Lambda function is scheduled to run every 10 minutes, the minimum of my range goes back up to 11 minutes in milliseconds. I would greatly appreciate it if someone could help me make this piece more efficient.

import os
import boto3
import json
import tempfile
import re
from datetime import datetime, timezone
import time

def lambda_handler(event, context):
    # TODO implement
    s3_client = boto3.client("s3")
    s3_resource = boto3.resource('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    keys = []
    result = []
    now = int(round(time.time() * 1000))
    now_min = now - 660000  # go back 11 mins, in milliseconds
    times = list(range(now_min, now + 1))  # one candidate prefix per millisecond
    for t in times:
        prefix = 'Uploads/' + datetime.now(timezone.utc).strftime('%Y-%m-%d') + '/' + datetime.utcnow().strftime('%H') + '/' + str(t)
        pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
        for page in pages:
            if page.get('KeyCount') != 0:
                for obj in page['Contents']:
                    keys.append(obj['Key'])

The goal is to take these 800k files and condense them into multiple larger files instead of having 800k small files.

    for key in keys[1:]:
        local_filepath = os.path.join(tempfile.gettempdir(), key)
        regex_local_filepath = '/tmp/' + re.search(r'([^/]+$)', local_filepath).group(0)  # file name only
        re_key = re.search(r'([^-/]+$)', key).group(0)
        re_key = re_key.replace('.json', '')
        s3_resource.Bucket('bucket').download_file(key, regex_local_filepath)
        with open(regex_local_filepath, 'r') as infile:
            result.append(json.load(infile))
        file_name = 'Uploads/' + datetime.now(timezone.utc).strftime('%Y-%m-%d') + '/' + datetime.utcnow().strftime('%H') + '/' + str(now) + '.json'
        s3object = s3_resource.Object('new-bucket', file_name)
        # Note: this PUT runs on every iteration, re-uploading the growing result.
        s3object.put(
            Body=json.dumps(result, indent=2, sort_keys=True).encode('UTF-8')
        )
    return None
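
For reference on the listing cost: the loop above issues one paginated ListObjectsV2 call per candidate millisecond, i.e. roughly 660,000 list calls per run. Since the millisecond timestamps are fixed-width, key order within an hour prefix is also timestamp order, so ListObjectsV2's StartAfter parameter can begin a single listing at the oldest timestamp of interest. A minimal sketch, assuming 13-digit timestamps at the start of each file name; the bucket name and key layout are taken from the code above:

import time
import boto3
from datetime import datetime, timezone

def list_recent_keys(window_ms=660000):
    # One paginated listing for the whole window instead of one per millisecond.
    s3_client = boto3.client('s3')
    now = int(round(time.time() * 1000))
    hour_prefix = 'Uploads/' + datetime.now(timezone.utc).strftime('%Y-%m-%d/%H') + '/'
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket='bucket',
        Prefix=hour_prefix,
        StartAfter=hour_prefix + str(now - window_ms),  # skip keys older than the window
    )
    keys = []
    for page in pages:
        for obj in page.get('Contents', []):  # 'Contents' is absent on empty pages
            keys.append(obj['Key'])
    return keys

Note that this sketch does not handle a window that straddles the hour boundary; the accepted fix below sidesteps that by listing the whole 'Uploads/' prefix.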

I have figured out the correct way to loop through this efficiently. It seems I was looping through the listing multiple times and appending timestamps to the keys.

If you need to condense S3 files into larger single files, this approach works amazingly well. Cheers!

import os
import boto3
import json
import tempfile
import re
from datetime import datetime, timezone
import time

def lambda_handler(event, context):
    # TODO implement
    s3_client = boto3.client("s3")
    s3_resource = boto3.resource('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    now = int(round(time.time() * 1000))
    min_now = now - 360000 # Go back 6 mins since lambda function runs every 5 mins
    max_now = now + 60000 # This is to handle minute 59 after the hour.
    keys = []
    regex_keys = []
    result = []
    content_keys = []
    my_bucket = s3_resource.Bucket('bucket')
    prefix = 'Uploads/'
    key_objects = iter(my_bucket.objects.filter(Prefix=prefix))
    next(key_objects)  # skip the first entry under the prefix (e.g. the 'Uploads/' folder placeholder)
    for object_summary in key_objects:
        obj_key = object_summary.key # This gives me all the keys in the above prefix
        keys.append(obj_key)
    for key in keys:
        regex_key = re.search(r'/(.*?)-', key).group(0).replace('/', '').replace('-', '')  # I just want the timestamp (milliseconds)
        regex_keys.append(regex_key)
    for regex_key in regex_keys:
        if min_now <= int(regex_key) <= max_now:
            prefix = 'Uploads/' + str(regex_key)
            pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
            for page in pages:
                for obj in page['Contents']:
                    content_keys.append(obj['Key'])
    print(len(content_keys))
    return None
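
The listing above stops at counting content_keys; to actually produce the larger files, one option is to merge the keys in fixed-size batches, one output object per batch. A minimal sketch of that last step, not part of the original answer: the batch size, bucket names, and output key pattern are illustrative assumptions.

import json
import boto3

def merge_in_batches(content_keys, batch_size=1000):
    # Combine each batch of small JSON objects into a single larger object.
    s3_client = boto3.client('s3')
    for i in range(0, len(content_keys), batch_size):
        merged = []
        for key in content_keys[i:i + batch_size]:
            # Read the small object straight into memory instead of via /tmp.
            body = s3_client.get_object(Bucket='bucket', Key=key)['Body'].read()
            merged.append(json.loads(body))
        out_key = 'Condensed/batch-' + str(i // batch_size) + '.json'  # illustrative key
        s3_client.put_object(
            Bucket='new-bucket',
            Key=out_key,
            Body=json.dumps(merged).encode('UTF-8'),
        )

Reading bodies with get_object avoids the round-trip through /tmp that the original loop used, and each batch is uploaded exactly once.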
