[英]How do I modify my “download” function to work with 301/302 redirects?
[英]How do I modify this download function in Python?
目前这个函数很不稳定:遇到 gzip 压缩的内容或图片时,有时无法正常工作。
如何修改这个下载函数,使它在任何情况下都能正常工作(无论响应是否经过 gzip 压缩、带有什么标头)?
另外,如何自动“检测”响应是否为 gzip?我不想像现在这样每次都手动传入 True / False。
def download(source_url, g=False, correct_url=True):
    """Fetch ``source_url`` and return the response body as a string.

    The request carries a User-Agent picked at random from a fixed list and
    an ``Accept-encoding: gzip`` header.

    Args:
        source_url: URL to fetch.
        g: When true, the body is treated as gzip data and decompressed
            before being returned; when false the body is returned exactly
            as received.  The response headers are never consulted, so the
            caller must know in advance whether the server compresses.
        correct_url: Not used by this function; kept so that existing calls
            passing it keep working.

    Returns:
        The (optionally decompressed) body, or ``""`` if anything fails.
    """
    try:
        # NOTE: this sets the default timeout for every socket created
        # afterwards in the process, not only for this request.
        socket.setdefaulttimeout(10)
        agents = ['Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)','Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)','Microsoft Internet Explorer/4.0b1 (Windows 95)','Opera/8.00 (Windows NT 5.1; U; en)']
        ree = urllib2.Request(source_url)
        ree.add_header('User-Agent', random.choice(agents))
        ree.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        h = opener.open(ree).read()
        if g:
            # The module is imported as ``import StringIO``, so the class
            # has to be reached through the module name.
            compressedstream = StringIO.StringIO(h)
            gzipper = gzip.GzipFile(fileobj=compressedstream)
            return gzipper.read()
        return h
    except Exception:
        # Deliberate best-effort: any failure (network error, timeout, body
        # that is not gzip while g is true, ...) yields an empty string.
        return ""
检查 Content-Encoding 标头:
import urllib2
import socket
import random
import StringIO
import gzip
def download(source_url):
    """Fetch ``source_url`` and return its body, gunzipping it when needed.

    The request advertises ``Accept-encoding: gzip``; the body is
    decompressed only when the response's ``Content-Encoding`` header
    mentions gzip, so callers no longer have to pass a flag.

    Args:
        source_url: URL to fetch.

    Returns:
        The decoded body, or ``""`` when the request fails or times out.
    """
    try:
        # NOTE: this sets the default timeout for every socket created
        # afterwards in the process, not only for this request.
        socket.setdefaulttimeout(10)
        agents = ['Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)','Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)','Microsoft Internet Explorer/4.0b1 (Windows 95)','Opera/8.00 (Windows NT 5.1; U; en)']
        ree = urllib2.Request(source_url)
        ree.add_header('User-Agent', random.choice(agents))
        ree.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        response = opener.open(ree)
        try:
            encoding = response.headers.getheader('Content-Encoding')
            content = response.read()
        finally:
            # Release the connection even when read() fails part-way.
            response.close()
    except (urllib2.URLError, socket.error):
        # A timeout while reading the body surfaces as socket.timeout (a
        # socket.error), which is not a URLError; treat it like any other
        # failed download.
        return ""

    # Header values are compared case-insensitively ("gzip", "GZIP", ...).
    if encoding and 'gzip' in encoding.lower():
        compressedstream = StringIO.StringIO(content)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        return gzipper.read()
    return content
# Demo: fetch one Stack Overflow API resource and print the returned body.
url = 'http://api.stackoverflow.com/1.0/questions/3708418?type=jsontext'
data = download(url)
print(data)
如果您要处理的服务器没有把 Content-Encoding 标头设置为 gzip,可以采取更激进的做法:先直接尝试(try)按 gzip 解压,失败时再退回使用原始内容:
def download(source_url):
    """Fetch ``source_url`` and return its body, gunzipping it if possible.

    The response headers are ignored: the body is always fed to the gzip
    decoder first, and the raw bytes are returned unchanged when the decoder
    rejects them.  This copes with servers that compress without declaring
    ``Content-Encoding: gzip``.

    Args:
        source_url: URL to fetch.

    Returns:
        The decompressed body when it is valid gzip, the raw body otherwise,
        or ``""`` when the request fails or times out.
    """
    try:
        # NOTE: this sets the default timeout for every socket created
        # afterwards in the process, not only for this request.
        socket.setdefaulttimeout(10)
        agents = ['Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)','Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)','Microsoft Internet Explorer/4.0b1 (Windows 95)','Opera/8.00 (Windows NT 5.1; U; en)']
        ree = urllib2.Request(source_url)
        ree.add_header('User-Agent', random.choice(agents))
        ree.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        response = opener.open(ree)
        try:
            content = response.read()
        finally:
            # Release the connection even when read() fails part-way.
            response.close()
    except (urllib2.URLError, socket.error):
        # A timeout while reading the body surfaces as socket.timeout (a
        # socket.error), which is not a URLError; treat it like any other
        # failed download.
        return ""

    compressedstream = StringIO.StringIO(content)
    gzipper = gzip.GzipFile(fileobj=compressedstream)
    try:
        return gzipper.read()
    except IOError:
        # Not gzip data after all: hand back the bytes as received.
        return content
要检测所下载数据的类型,应将 h = opener.open(ree).read() 替换为 h = opener.open(ree)。
现在 h 中保存的是响应对象。您可以通过 h.headers(一个类似 dict 的对象)来分析标头,其中尤其值得关注的是 “content-type” 和 “content-encoding” 这两个标头。通过分析它们,就可以判断服务器发送的是什么内容。
def download(source_url, correct_url=True):
    """Fetch ``source_url`` and return its body, gunzipping it when flagged.

    The body is decompressed when either the ``content-type`` or the
    ``content-encoding`` response header contains ``gzip``; otherwise it is
    returned as received.

    Args:
        source_url: URL to fetch.
        correct_url: Not used by this function; kept so that existing calls
            passing it keep working.

    Returns:
        The (possibly decompressed) body, or ``""`` if anything fails.
    """
    try:
        # NOTE: this sets the default timeout for every socket created
        # afterwards in the process, not only for this request.
        socket.setdefaulttimeout(10)
        agents = ['Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)','Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)','Microsoft Internet Explorer/4.0b1 (Windows 95)','Opera/8.00 (Windows NT 5.1; U; en)']
        ree = urllib2.Request(source_url)
        ree.add_header('User-Agent', random.choice(agents))
        ree.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        h = opener.open(ree)
        try:
            # The condition spans two lines, so it must be parenthesised;
            # a bare trailing ``or`` is a syntax error.
            if ('gzip' in h.headers.get('content-type', '') or
                    'gzip' in h.headers.get('content-encoding', '')):
                # The module is imported as ``import StringIO``, so the
                # class has to be reached through the module name.
                compressedstream = StringIO.StringIO(h.read())
                gzipper = gzip.GzipFile(fileobj=compressedstream)
                return gzipper.read()
            return h.read()
        finally:
            # Release the connection on every path, including errors.
            h.close()
    except Exception:
        # Deliberate best-effort: any failure yields an empty string.
        return ""
import urllib2
import StringIO
import gzip
# Minimal sketch: fetch a URL and gunzip the body when the server says it
# is gzip-encoded.  'http://foo/' is a placeholder address.
req = urllib2.Request('http://foo/')
h = urllib2.urlopen(req)
data = h.read()
# .get() with a default avoids a KeyError when the header is absent.
if 'gzip' in h.headers.get('Content-Encoding', ''):
    # Wrap the bytes already read (not the response object) in a file-like
    # object so GzipFile can decode them.
    compressedstream = StringIO.StringIO(data)
    gzipper = gzip.GzipFile(fileobj=compressedstream)
    data = gzipper.read()
# etc...
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.