[英]How do I translate an AWS S3 url into a bucket name for boto?
I'm trying to access the http://s3.amazonaws.com/commoncrawl/parse-output/segment/ bucket with boto.我正在尝试使用 boto 访问http://s3.amazonaws.com/commoncrawl/parse-output/segment/存储桶。 I can't figure out how to translate this into a name for boto.s3.bucket.Bucket().我不知道如何将其转换为 boto.s3.bucket.Bucket() 的名称。
This is the gist of what I'm going for:这是我要做的事情的要点:
# Obtain a connection object from boto (presumably using credentials from the
# environment or boto config -- not shown here).
s3 = boto.connect_s3()
# NOTE: the name passed here is the whole URL path
# ('commoncrawl/parse-output/segment'), slashes included.
cc = boto.s3.bucket.Bucket(connection=s3, name='commoncrawl/parse-output/segment')
# Extra header sent along with the listing request.
requester = {'x-amz-request-payer':'requester'}
contents = cc.list(headers=requester)
# Print the repr of every listed item (Python 2 print statement); the
# enumerate index `i` is not used.
for i,item in enumerate(contents):
    print item.__repr__()
I get "boto.exception.S3ResponseError: S3ResponseError: 400 Bad Request... The specified bucket is not valid..."我收到“boto.exception.S3ResponseError:S3ResponseError:400 错误请求...指定的存储桶无效...”
The AWS documents list four possible URL formats for S3 -- here's something I just threw together to extract the bucket and region for all of the different URL formats. AWS 文档列出了 S3 的四种可能的 url 格式——这是我刚刚汇总的内容,用于提取所有不同 url 格式的存储桶和区域。
import re
def bucket_name_from_url(url):
    """Extract the bucket name and region from an S3 HTTP(S) URL.

    Recognised URL layouts:

    * http://bucket.s3.amazonaws.com
    * http://bucket.s3-aws-region.amazonaws.com
    * http://s3.amazonaws.com/bucket
    * http://s3-aws-region.amazonaws.com/bucket

    Bucket names containing periods are supported. For the virtual-hosted
    layouts (bucket in the host name) a trailing slash is optional.

    :param url: the URL to inspect
    :return: a ``(bucket, region)`` tuple; ``region`` is ``None`` when the
        URL does not name one, and both values are ``None`` when the URL
        matches none of the layouts above
    """
    # The host part is matched with [^/]+ so it can never run into the path,
    # and every literal dot is escaped so it does not match arbitrary
    # characters. (?:[/?#]|$) makes sure the host really ends after ".com".
    match = re.search(r'^https?://([^/]+)\.s3\.amazonaws\.com(?:[/?#]|$)', url)
    if match:
        return match.group(1), None

    match = re.search(
        r'^https?://([^/]+)\.s3-([^./]+)\.amazonaws\.com(?:[/?#]|$)', url)
    if match:
        return match.group(1), match.group(2)

    # Path-style layouts: the bucket is the first path segment.
    match = re.search(r'^https?://s3\.amazonaws\.com/([^/]+)', url)
    if match:
        return match.group(1), None

    match = re.search(r'^https?://s3-([^./]+)\.amazonaws\.com/([^/]+)', url)
    if match:
        return match.group(2), match.group(1)

    return None, None
Something like this should really go into boto... Amazon, I hope you're listening像这样的东西真的应该进入 boto ......亚马逊,我希望你在听
EDIT 10/10/2018 : The bucket regexes should now capture bucket names with periods.编辑 10/10/2018 :桶正则表达式现在应该捕获带句点的桶名称。
Extended Mark's answer to also return keys.扩展了 Mark 的回答,使其同时返回键
#!/usr/bin/env python
import re
def parse_s3_url(url):
    """Split an S3 HTTP(S) URL into bucket name, region and key.

    Recognised URL layouts:

    * http://bucket.s3.amazonaws.com/key1/key2
    * http://bucket.s3-aws-region.amazonaws.com/key1/key2
    * http://s3.amazonaws.com/bucket/key1/key2
    * http://s3-aws-region.amazonaws.com/bucket/key1/key2

    Bucket names containing periods are supported.

    :param url: the URL to parse
    :return: a list ``[bucket_name, region, key]``. Leading and trailing
        slashes are stripped from each value; a value that is absent from
        the URL is ``None``. All three are ``None`` when the URL matches
        none of the layouts.
    """
    # Literal dots are escaped and the host is matched with [^/]+ so that it
    # cannot spill into the path. Named groups let one loop handle all four
    # layouts even though bucket and region appear in different orders.
    patterns = (
        r'^https?://(?P<bucket>[^/]+)\.s3\.amazonaws\.com(?P<key>/.*)?$',
        r'^https?://(?P<bucket>[^/]+)\.s3-(?P<region>[^./]+)'
        r'\.amazonaws\.com(?P<key>/.*)?$',
        r'^https?://s3\.amazonaws\.com/(?P<bucket>[^/]+)(?P<key>.*)$',
        r'^https?://s3-(?P<region>[^./]+)\.amazonaws\.com/'
        r'(?P<bucket>[^/]+)(?P<key>.*)$',
    )

    bucket_name = region = key = None
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            found = match.groupdict()
            bucket_name = found['bucket']
            region = found.get('region')  # not every layout names a region
            key = found.get('key')
            break

    return [part.strip('/') if part else None
            for part in (bucket_name, region, key)]
The bucket name would be commoncrawl.存储桶名称将是 commoncrawl。 Everything that appears after that is really just part of the name of the keys that appear in the bucket.之后出现的所有内容实际上只是存储桶中出现的密钥名称的一部分。
Here is my JS version:这是我的 JS 版本:
/**
 * Split an S3 HTTP(S) URL into its bucket, key and region.
 *
 * Recognised layouts:
 *   http://s3.amazonaws.com/bucket/key1/key2
 *   http://s3-aws-region.amazonaws.com/bucket/key1/key2
 *   http://bucket.s3.amazonaws.com/key1/key2
 *   http://bucket.s3-aws-region.amazonaws.com/key1/key2
 *
 * Bucket names containing periods are supported.
 *
 * @param {string} url - The URL to parse. It is passed through
 *   decodeURIComponent first, which throws URIError on malformed
 *   percent-escapes.
 * @returns {{bucket: string, key: string, region: string}} The parts found;
 *   any part that is absent (or every part, if the URL matches no layout)
 *   is the empty string.
 */
function parseS3Url(url) {
  url = decodeURIComponent(url);
  let match;

  // Literal dots are escaped, and host/bucket/region groups exclude "/" so
  // that the host part can never run into the path.

  // http://s3.amazonaws.com/bucket/key1/key2
  match = url.match(/^https?:\/\/s3\.amazonaws\.com\/([^\/]+)\/?(.*?)$/);
  if (match) {
    return {
      bucket: match[1],
      key: match[2],
      region: ""
    };
  }

  // http://s3-aws-region.amazonaws.com/bucket/key1/key2
  match = url.match(/^https?:\/\/s3-([^.\/]+)\.amazonaws\.com\/([^\/]+)\/?(.*?)$/);
  if (match) {
    return {
      bucket: match[2],
      key: match[3],
      region: match[1]
    };
  }

  // http://bucket.s3.amazonaws.com/key1/key2
  match = url.match(/^https?:\/\/([^\/]+)\.s3\.amazonaws\.com(?:\/(.*))?$/);
  if (match) {
    return {
      bucket: match[1],
      // The key group is undefined when the URL has no path at all.
      key: match[2] || "",
      region: ""
    };
  }

  // http://bucket.s3-aws-region.amazonaws.com/key1/key2
  match = url.match(/^https?:\/\/([^\/]+)\.s3-([^.\/]+)\.amazonaws\.com(?:\/(.*))?$/);
  if (match) {
    return {
      bucket: match[1],
      key: match[3] || "",
      region: match[2]
    };
  }

  return {
    bucket: "",
    key: "",
    region: ""
  };
}
Based on Mark's answer, I've made a small pyparsing script that is clearer to me (including possible key matches):根据马克的回答,我制作了一个对我来说更清晰的小 pyparsing 脚本(包括可能的关键匹配):
#!/usr/bin/env python
from pyparsing import Word, alphanums, Or, Optional, Combine
# Grammar pieces for S3 URLs, assembled from pyparsing elements.
# Each piece is given a results name so it can be read back from the parse
# result as an attribute (see the loop at the bottom).
schema = Or(['http://', 'https://']).setResultsName('schema')
# One or more alphanumerics or hyphens; the same element is reused for the
# bucket, the region and the key. It does not include '.' or '/'.
word = Word(alphanums + '-', min=1)
bucket_name = word.setResultsName('bucket')
region = word.setResultsName('region')
# Optional "/word" suffix; because `word` excludes '/', only a single path
# segment fits here.
key = Optional('/' + word.setResultsName('key'))
# The bare string statements below act as labels for the layout that the
# following `optN` expression describes.
"bucket.s3.amazonaws.com"
opt1 = Combine(schema + bucket_name + '.s3.amazonaws.com' + key)
"bucket.s3-aws-region.amazonaws.com"
# NOTE(review): `region` directly follows the '.', so it looks like the
# captured region will include the leading "s3-" -- confirm.
opt2 = Combine(schema + bucket_name + '.' + region + '.amazonaws.com' + key)
"s3.amazonaws.com/bucket"
opt3 = Combine(schema + 's3.amazonaws.com/' + bucket_name + key)
"s3-aws-region.amazonaws.com/bucket"
opt4 = Combine(schema + region + ".amazonaws.com/" + bucket_name + key)
# Sample URLs: each of the four layouts, first without and then with a key.
tests = [
    "http://bucket-name.s3.amazonaws.com",
    "https://bucket-name.s3-aws-region-name.amazonaws.com",
    "http://s3.amazonaws.com/bucket-name",
    "https://s3-aws-region-name.amazonaws.com/bucket-name",
    "http://bucket-name.s3.amazonaws.com/key-name",
    "https://bucket-name.s3-aws-region-name.amazonaws.com/key-name",
    "http://s3.amazonaws.com/bucket-name/key-name",
    "https://s3-aws-region-name.amazonaws.com/bucket-name/key-name",
]
# The four layouts combined into a single alternative.
s3_url = Or([opt1, opt2, opt3, opt4]).setResultsName('url')
# Parse every sample and print the url, schema, bucket and key results
# (the region result is not printed). Uses Python 2 print statements.
for test in tests:
    result = s3_url.parseString(test)
    print "found url: " + str(result.url)
    print "schema: " + str(result.schema)
    print "bucket name: " + str(result.bucket)
    print "key name: " + str(result.key)
Originally I made Mark's script also retrieve the key (object):原来我让马克的脚本也检索了密钥(对象):
def parse_s3_url(url):
    """Extract the bucket name, region and key from an S3 HTTP(S) URL.

    Recognised URL layouts (each may be followed by ``/key``):

    * http://bucket.s3.amazonaws.com
    * http://bucket.s3-aws-region.amazonaws.com
    * http://s3.amazonaws.com/bucket
    * http://s3-aws-region.amazonaws.com/bucket

    :param url: the URL to parse
    :return: a ``(bucket, region, key)`` tuple. ``region`` is ``None`` when
        the URL does not name one; ``key`` is the path after the bucket
        without its leading slash, or ``None`` when there is none. All three
        are ``None`` when the URL matches none of the layouts.
    """
    # Every pattern defines the groups it can provide by name, so no branch
    # can ask for a group number that its pattern does not have.
    patterns = (
        r'^https?://(?P<bucket>[^/]+)\.s3\.amazonaws\.com(?:/(?P<key>.*))?$',
        r'^https?://(?P<bucket>[^/]+)\.s3-(?P<region>[^./]+)'
        r'\.amazonaws\.com(?:/(?P<key>.*))?$',
        r'^https?://s3\.amazonaws\.com/(?P<bucket>[^/]+)(?:/(?P<key>.*))?$',
        r'^https?://s3-(?P<region>[^./]+)\.amazonaws\.com/'
        r'(?P<bucket>[^/]+)(?:/(?P<key>.*))?$',
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            found = match.groupdict()
            # `or None` turns the empty key of a bare trailing slash into None.
            return found['bucket'], found.get('region'), found.get('key') or None
    return None, None, None
The other answers would not support S3 URLs like "s3://bucket/key", so I wrote a Python function inspired by the Java wrapper:其他答案不支持像“s3://bucket/key”这样的 S3 url,所以我写了一个受Java 包装器启发的 python 函数:
def bucket_name_from_url(url):
    """Parse the bucket and key out of an S3 URI.

    Implementation based on com.amazonaws.services.s3.AmazonS3URI.
    Handles ``s3://bucket/key`` URIs as well as HTTP(S) URLs with the bucket
    either in the host name (``https://bucket.s3.amazonaws.com/key``) or as
    the first path segment (``https://s3.amazonaws.com/bucket/key``).

    For HTTP(S) URLs the bucket and key are percent-decoded; for ``s3://``
    URIs they are returned as written.

    :param url: the URL to parse
    :return: a ``(bucket, key)`` tuple; ``key`` is ``None`` when the URL
        names only a bucket, and both values are ``None`` when the URL is
        not recognised as an S3 URL
    """
    # Imported locally so the function works on both Python 3 and Python 2.
    try:
        from urllib.parse import unquote, urlparse
    except ImportError:  # Python 2
        from urllib import unquote
        from urlparse import urlparse

    uri = urlparse(url)
    path = uri.path

    if uri.scheme == "s3":
        # s3://bucket and s3://bucket/ carry no key; otherwise drop the
        # leading '/'.
        key = path[1:] if len(path) > 1 else None
        return uri.netloc, key

    if uri.scheme not in ("http", "https"):
        return None, None

    # Match against the host only, so that an "s3." appearing somewhere in
    # the path cannot be mistaken for part of the endpoint name.
    match = re.search(r'^(.+\.)?s3[.-]([a-z0-9-]+)\.', uri.netloc)
    if match is None:
        # Not an S3 endpoint.
        return None, None

    prefix = match.group(1)
    if prefix:
        # Bucket name was found in the host (minus the trailing '.');
        # the path is the key.
        bucket = prefix[:-1]
        key = unquote(path[1:]) if len(path) > 1 else None
        return bucket, key

    # No bucket name in the host: the first path segment is the bucket and
    # whatever follows the next '/' is the key.
    raw_bucket, _, raw_key = path[1:].partition('/')
    bucket = unquote(raw_bucket) if raw_bucket else None
    key = unquote(raw_key) if raw_key else None
    return bucket, key
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.