Python 脚本因大量数据而失败

Question

I wrote a python script that seemed to work pretty well.我写了一个似乎运行良好的 python 脚本。 It lists EC2 instances in AWS and then writes them to our Confluence wiki.它列出了 AWS 中的 EC2 实例，然后将它们写入我们的 Confluence wiki。

If it processes one environment with 10 servers it works and writes to Confluence.如果它处理一个有 10 个服务器的环境，它会工作并写入 Confluence。 If it works against an account with 100 or more servers it fails to write to Confluence with this stack trace:如果它适用于拥有 100 台或更多服务器的帐户，则无法使用以下堆栈跟踪写入 Confluence：

Traceback (most recent call last):
  File ".\aws_ec2_list_instances_orig.py", line 550, in <module>
    main()
  File ".\aws_ec2_list_instances_orig.py", line 543, in main
    write_data_to_confluence(auth, html, pageid, title)
  File ".\aws_ec2_list_instances_orig.py", line 391, in write_data_to_confluence
    r.raise_for_status()
  File "C:\Users\tdunphy\AppData\Roaming\Python\Python37\site-packages\requests\models.py", line 940, in raise_for_status
requests.exceptions.HTTPError: 400 Client Error:  for url: https://wiki.us.cworld.company.com/rest/api/content/138317098

I've also raised a verbose error here:我还在这里提出了一个详细的错误：

Traceback (most recent call last):
  File ".\aws_ec2_list_instances_orig.py", line 538, in <module>
    main()
  File ".\aws_ec2_list_instances_orig.py", line 531, in main
    write_data_to_confluence(auth, html, pageid, title)
  File ".\aws_ec2_list_instances_orig.py", line 380, in write_data_to_confluence
    raise RuntimeError(r.content)
RuntimeError: b'{"statusCode":400,"data":{"authorized":false,"valid":true,"allowedInReadOnlyMode":true,"errors":[],"successful":false},"message":"Error parsing xhtml: Unexpected character \'<\' (code 60); expected a semi-colon after the reference for entity \'C\'\\n at [row,col {unknown-source}]: [1,46579]","reason":"Bad Request"}'

Please note I AM NOT ALLOWED TO POST THE COMPANY DOMAIN IN MY POSTS.请注意，我不允许在我的帖子中发布公司域名。 I will substitute 'company.com' where my real company domain would be.我将在我的真实公司域所在的位置替换“company.com”。

Here is the script:这是脚本：

#!/usr/bin/env python3

# Import modules
import boto3
import time
import objectpath
import csv
import os
import sys
import json
import requests
from requests_kerberos import HTTPKerberosAuth
import codecs
from datetime import datetime
from os.path import basename
from subprocess import check_output,CalledProcessError,PIPE

BASE_URL = "https://wiki.us.cworld.company.com/rest/api/content"
VIEW_URL = "https://wiki.us.cworld.company.com/pages/viewpage.action?pageId="

def banner(message, border='-'):
    line = border * len(message)
    print(line)
    print(message)
    print(line)

def initialize(interactive, aws_account):
    # Set the date
    today = datetime.today()
    today = today.strftime("%m-%d-%Y")
    # Set source files
    aws_env_list="../../source_files/aws_environments/aws_environments_all.txt"
    output_dir = "../../output_files/aws_instance_list/csv/"
    output_file = output_dir + 'aws-instance-master-list-' + aws_account + '-' + today +'.csv'
    output_file_name = 'aws-instance-master-list-' + aws_account + '-' + today +'.csv'
    return today, aws_env_list, output_file, output_file_name

def authenticate():
    #auth = get_login()
    auth = ('tdunphy', 'local4tl4nt1cNJ!')
    auth = str(auth).replace('(','').replace('\'','').replace(',',':').replace(')','').replace(' ','')
    kerberos_auth = HTTPKerberosAuth(mutual_authentication="DISABLED",principal=auth)
    auth = kerberos_auth
    return auth

## These are dummy AWS account numbers. I cannot post account number for my company.
def aws_accounts_to_account_numbers(aws_account):
    switcher = {
        'company-lab': '123456789101',
        'company-bill': '123456789102',
        'company-stage': '123456789103',
        'company-dlab': '123456789103',
    }
    return switcher.get(aws_account, "nothing")


def list_instances(aws_account,aws_account_number, interactive):
    today, aws_env_list, output_file, output_file_name = initialize(interactive, aws_account)
    engagement = None
    # Set the account
    session = boto3.Session(profile_name=aws_account)
    ec2 = session.client("ec2")
    fieldnames = [ 'AWS Account', 'Account Number', 'Name', 'Instance ID', 'VPC ID', 'Type', 'Platform', 'State', 'Key Name', 'Private IP', 'Public IP', 'Private DNS', 'Volumes', 'Availability Zone', 'Launch Date', 'Engagement Code']
    # Set the ec2 dictionary
    ec2info = {}
    public_ips_list = ''
    private_ips_list = ''
    private_dns = None
    with open(output_file, mode='w+') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',', lineterminator='\n')
        writer.writeheader()
    if 'gov' in aws_account and not 'admin' in aws_account:
        print("This is a gov account.")
        session = boto3.Session(profile_name=aws_account,region_name='us-gov-west-1')
    else:
        print("This is a commercial account.")
        session = boto3.Session(profile_name=aws_account,region_name='us-east-1')

    ec2 = session.client("ec2")
    # Loop through the instances
    instance_list = ec2.describe_instances()
    for reservation in instance_list["Reservations"]:
            for instance in reservation.get("Instances", []):
                launch_time = instance["LaunchTime"]
                launch_time_friendly = launch_time.strftime("%B %d %Y")
                tree = objectpath.Tree(instance)
                block_devices = set(tree.execute('$..BlockDeviceMappings[\'Ebs\'][\'VolumeId\']'))
                if len(block_devices) == 0:
                    block_devices_list = None
                else:
                    block_devices_list = list(block_devices)
                    block_devices_list = str(block_devices_list).replace('[','').replace(']','').replace('\'','').replace('{','').replace('}', '')
                private_ips =  set(tree.execute('$..PrivateIpAddress'))
                if len(private_ips) == 0:
                    private_ips_list = None
                else:
                    private_ips_list = list(private_ips)
                    private_ips_list = str(private_ips_list).replace('[','').replace(']','').replace('\'','')
                public_ips =  set(tree.execute('$..PublicIp'))
                if len(public_ips) == 0:
                    public_ips_list = None
                else:
                    public_ips_list = list(public_ips)
                    public_ips_list = str(public_ips_list).replace('[','').replace(']','').replace('\'','')
                if 'KeyName' in instance:
                    key_name = instance['KeyName']
                else:
                    key_name = None
                name = None
                if 'Tags' in instance:
                    try:
                        tags = instance['Tags']
                        name = None
                        for tag in tags:
                            if tag["Key"] == "Name":
                                name = tag["Value"]
                        for tag in tags:
                            if tag["Key"] == "Engagement":
                                engagement = tag["Value"]
                            else:
                                engagement = None
                    except ValueError:
                        print("Instance: %s has no tags" % instance_id)
                if 'VpcId' in instance:
                    vpc_id = instance['VpcId']
                else:
                    vpc_id = None
                if 'PrivateDnsName' in instance:
                    private_dns = instance['PrivateDnsName']
                else:
                    private_dns = None
                if 'Platform' in instance:
                    platform = instance['Platform']
                else:
                    platform = None
                ec2info[instance['InstanceId']] = {
                    'AWS Account': aws_account,
                    'Account Number': aws_account_number,
                    'Name': name,
                    'Instance ID': instance['InstanceId'],
                    'VPC ID': vpc_id,
                    'Type': instance['InstanceType'],
                    'Platform': platform,
                    'State': instance['State']['Name'],
                    'Key Name': key_name,
                    'Private IP': private_ips_list,
                    'Public IP': public_ips_list,
                    'Private DNS': private_dns,
                    'Volumes': block_devices_list,
                    'Availability Zone': instance['Placement']['AvailabilityZone'],
                    'Launch Date': launch_time_friendly,
                    'Engagement Code': engagement
                }
                with open(output_file,'a') as csv_file:
                    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',', lineterminator='\n')
                    writer.writerow({'AWS Account': aws_account, "Account Number": aws_account_number, 'Name': name, 'Instance ID': instance["InstanceId"], 'VPC ID': vpc_id, 'Type': instance["InstanceType"], 'Platform': platform, 'State': instance["State"]["Name"], 'Key Name': key_name,  'Private IP': private_ips_list, 'Public IP': public_ips_list, 'Private DNS': private_dns, 'Volumes': block_devices, 'Availability Zone': instance['Placement']['AvailabilityZone'], 'Launch Date': launch_time_friendly, 'Engagement Code': engagement})
    for instance_id, instance in ec2info.items():
        print(Fore.RESET + "-------------------------------------")
        for key in [
            'AWS Account',
            'Account Number',
            'Name',
            'Instance ID',
            'VPC ID',
            'Type',
            'Platform',
            'Key Name',
            'State',
            'Private IP',
            'Public IP',
            'Private DNS',
            'Volumes',
            'Availability Zone',
            'Launch Date',
            'Engagement Code'
        ]:
            print(Fore.GREEN + "{0}: {1}".format(key, instance.get(key)))   
        time.sleep(2)
    print(Fore.RESET + "-------------------------------------")
    with open(output_file,'a') as csv_file:
        csv_file.close()
    return output_file


def convert_csv_to_html_table(output_file, today, interactive, aws_account):
    output_dir = "../../output_files/aws_instance_list/html/"
    if interactive == 1:
        htmlfile = output_dir + 'aws-instance-master-list-' + aws_account + '-' + today +'.html'
        htmlfile_name = 'aws-instance-master-list-' + aws_account + '-' + today +'.html'
    else:
        htmlfile = output_dir + 'aws-instance-master-list-' + today +'.html'
        htmlfile_name = 'aws-instance-master-list-' + today +'.html'
    count = 0
    html = ''
    with open(output_file,'r') as CSVFILE:
        reader = csv.reader(CSVFILE)
        with open(output_file,'r') as CSVFILE:
            reader = csv.reader(CSVFILE)
            html += "<table><tbody>"
            for row in reader:
                html += "<tr>"
                # Process the headers
                if count == 0:
                    for column in row:
                        html += "<th>%s</th>" % column
                else:
                    # Process the data
                    for column in row:
                        html += "<td>%s</td>" % column
                html += "</tr>"
                count += 1
            html += "</tbody></table>"
    with open(htmlfile,'w+') as HTMLFILE:
        HTMLFILE.write(html)
    return htmlfile, htmlfile_name


def get_page_ancestors(auth, pageid):
    # Get basic page information plus the ancestors property
    url = '{base}/{pageid}?expand=ancestors'.format(
        base = BASE_URL,
        pageid = pageid)
    r = requests.get(url, auth = auth)
    r.raise_for_status()
    return r.json()['ancestors']


def get_page_info(auth, pageid):
    url = '{base}/{pageid}'.format(
        base = BASE_URL,
        pageid = pageid)
    r = requests.get(url, auth = auth)
    r.raise_for_status()
    return r.json()


def write_data_to_confluence(auth, html, pageid, title = None):
    info = get_page_info(auth, pageid)
    ver = int(info['version']['number']) + 1
    ancestors = get_page_ancestors(auth, pageid)
    anc = ancestors[-1]
    del anc['_links']
    del anc['_expandable']
    del anc['extensions']
    if title is not None:
        info['title'] = title
    data = {
        'id' : str(pageid),
        'type' : 'page',
        'title' : info['title'],
        'version' : {'number' : ver},
        'ancestors' : [anc],
        'body'  : {
            'storage' :
            {
                'representation' : 'storage',
                'value' : str(html)
            }
        }
    }
    data = json.dumps(data)
    url = '{base}/{pageid}'.format(base = BASE_URL, pageid = pageid)
    r = requests.put(
        url,
        data = data,
        auth = auth,
        headers = { 'Content-Type' : 'application/json' }
    )
    r.raise_for_status()
    print("Wrote '%s' version %d" % (info['title'], ver))
    print("URL: %s%d" % (VIEW_URL, pageid))

def main():
    pageid = 138317098
    title = 'AWS EC2 Instance List'
    aws_account = input("Enter the name of the AWS account you'll be working in: ")
    aws_account_number = aws_accounts_to_account_numbers(aws_account)
    today, aws_env_list, output_file, output_file_name = initialize(interactive, aws_account)
    output_file = list_instances(aws_account,aws_account_number, interactive)
    htmlfile, htmlfile_name = convert_csv_to_html_table(output_file, today, interactive, aws_account)
    with open(htmlfile, 'r', encoding='utf-8') as htmlfile:
        html = htmlfile.read()
    auth = authenticate()
    write_data_to_confluence(auth, html, pageid, title)


if __name__ == "__main__":
    main()

Why does this script fail to write to confluence only when it processes a lot of servers?为什么这个脚本只有在处理大量服务器时才无法写入confluence？

Answer 1

This is a little bit tricky to diagnose without being able to see the data you're working with.在无法查看您正在使用的数据的情况下进行诊断有点棘手。 As noted by fpbhb, the error message suggests that there is an issue with the generated html (probably an & somewhere in the input).正如 fpbhb 所指出的，错误消息表明生成的 html 存在问题（可能是输入中的&某处）。 I would try escaping the CSV field data before wrapping it in the HTML tags:我会尝试 escaping CSV 字段数据，然后将其包装在 HTML 标签中：

from html import escape


def convert_csv_to_html_table(output_file, today, interactive, aws_account):
    # [...]

    count = 0
    html = ''
    with open(output_file,'r') as CSVFILE:
        reader = csv.reader(CSVFILE)
        html += "<table><tbody>"
        for row in reader:
            html += "<tr>"
            # Process the headers
            if count == 0:
                for column in row:
                    html += "<th>%s</th>" % escape(column)
            else:
                # Process the data
                for column in row:
                    html += "<td>%s</td>" % escape(column)
            html += "</tr>"
            count += 1
        html += "</tbody></table>"
    with open(htmlfile,'w+') as HTMLFILE:
        HTMLFILE.write(html)
    return htmlfile, htmlfile_name

Python 脚本因大量数据而失败

问题描述

1 个解决方案

解决方案1
1 已采纳 2019-11-17 01:33:11

Python 脚本因大量数据而失败

问题描述

1 个解决方案

解决方案1 1 已采纳 2019-11-17 01:33:11

解决方案1
1 已采纳 2019-11-17 01:33:11