
PIL/Pillow IOError: cannot identify image file u'xxxx.jpg' (Windows only)

I have written a Python script that runs fine on OS X/Linux, but on Windows I run into the problem described in the title. It uses the Pillow module, and the error originates from line 2274 of the module PIL\Image.py.

My code:

# -*- coding: utf-8 -*-

import os
import sys
import urllib2
from PIL import Image, ImageFile
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger
from bs4 import BeautifulSoup

ImageFile.LOAD_TRUNCATED_IMAGES = True

def parser():
    try:
        return sys.argv[1].lower()
    except IndexError:
        print 'no argument specified'


the_url = 'http://www.oldgames.sk'
base_url = the_url + '/mags/'

# Add magazines + relative URLs here
magazines = {
        'score': 'score/',
        'level': 'level/',
        'amiga': 'amiga-magazin/',
        'bit': 'bit/',
        'commodore': 'commodore-amater/',
        'CGW': 'cgw/',
        'excalibur': 'excalibur/',
        'hrac': 'hrac-cz/',
        'joystick': 'joystick-sk/',
        'pocitac-aktivne': 'pocitac-aktivne/',
        'pocitacove-hry': 'pocitacove-hry/',
        'riki': 'riki/',
        'zzap64': 'zzap64/'}

issue_links = []
download_list = {}

def parse_args(arg):
    if arg == '--list':
        items = [i for i in magazines.keys()]
        for item in items:
            print item
        sys.exit()
    elif arg in magazines:
        print "Scraping %s magazine..." % arg.capitalize()
        return base_url + magazines[arg]
    else:
        return sys.exit('invalid magazine name')

def extract_links_to_issue(url):
    soup = BeautifulSoup(urllib2.urlopen(url))

    for div in soup.findAll('div','mImage'):
        issue_links.append(the_url + div.a['href'])

    print 'Scraped %d links' % len(issue_links)

def issue_renamer(issue_name):
    char1 = '\\'
    char2 = '/'
    replacement = '-'
    if char1 in issue_name:
        issue_name = issue_name.replace(char1, replacement)
        print 'inv. char (%s): renaming to %s' % (char1, issue_name)
    elif char2 in issue_name:
        issue_name = issue_name.replace(char2, replacement)
        print 'inv. char (%s): renaming to %s' % (char2, issue_name)

    return issue_name

def extract_links_to_images(issue_links):
    for index, link in enumerate(issue_links):
        print 'Scraping issue #%d: %s' % (index + 1, link)
        issue_soup = BeautifulSoup(urllib2.urlopen(link))
        image_list = []
        for image in issue_soup.findAll('div', 'mags_thumb_article'):
            issue_name = issue_renamer(issue_soup.findAll('h1','top')[0].text)
            image_list.append(the_url + image.a['href'])

        download_list[issue_name] = image_list

def clean_up(list_of_files, list_of_pdfs):
    num = len(list_of_files) + len(list_of_pdfs)
    for file in list_of_files:
        os.remove(file)
    for pdf in list_of_pdfs:
        os.remove(pdf)
    print 'Cleaned up %d files' % num

def convert_images(list_of_files, issue):
    list_of_pdfs = []
    for index, file in enumerate(list_of_files):
        im = Image.open(file)
        outfile = file + '.pdf'
        im.save(outfile, 'PDF')
        list_of_pdfs.append(outfile)

        print 'converting ...' + str((index + 1)) + '/' + str(len(list_of_files))

    final_pdf = PdfFileMerger()
    for pdf in list_of_pdfs:
        final_pdf.append(open(pdf, 'rb'))

    issue_name = issue + '.pdf'
    final_pdf.write(open(issue_name, 'wb'))
    final_pdf.close()
    print '--- PDF completed ---'

    clean_up(list_of_files, list_of_pdfs)

def download_images(download_list):
    for issues,image_list in download_list.items():
        print 'Preparing %s ...' % issues
        list_of_files = []
        for image in image_list:
            image_name = os.path.split(image)[1]
            list_of_files.append(image_name)
            f = open(image_name, 'w')
            f.write(urllib2.urlopen(image).read())
            print 'Downloading image: %s' % image
            f.close()
        convert_images(list_of_files, issues)


arg = parser()
extract_links_to_issue(parse_args(arg))
extract_links_to_images(issue_links)
download_images(download_list)

I would like to fix this. Can anyone help me?

You are copying the image into a file opened in text mode:

f = open(image_name, 'w')
f.write(urllib2.urlopen(image).read())

On Windows, this means that any 0A (newline) bytes are translated to the 0D 0A byte sequence (carriage return plus newline), because that is the Windows line separator. The inserted 0D bytes corrupt the image data, which is why Pillow can no longer identify the file.
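
To see why this breaks Pillow, here is a minimal sketch (a hypothetical demo, not part of the original script) that writes the 8-byte PNG signature in text mode and reads the raw bytes back; on Windows, the 0A bytes come back mangled:

import os

data = '\x89PNG\r\n\x1a\n'  # the PNG signature contains both a 0D 0A pair and a lone 0A

with open('demo.bin', 'w') as f:   # text mode: on Windows every \n is written as \r\n
    f.write(data)

with open('demo.bin', 'rb') as f:  # binary mode: read the bytes back exactly as stored
    print repr(f.read())           # on Windows: '\x89PNG\r\r\n\x1a\r\n' -- signature corrupted

os.remove('demo.bin')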

Open the file in binary mode instead:

f = open(image_name, 'wb')
f.write(urllib2.urlopen(image).read())

I would also switch to using the file as a context manager (via the with statement), so you don't have to close it manually, and use shutil.copyfileobj() to stream the data to disk in chunks rather than reading the entire image into memory at once:

import shutil

# ...
with open(image_name, 'wb') as f:
    shutil.copyfileobj(urllib2.urlopen(image), f)
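
Note that in Python 2 the response object returned by urllib2.urlopen() is not itself a context manager, so if you want the network response closed deterministically as well, you can wrap it in contextlib.closing(). A small sketch, assuming the same image URL and image_name variables as in the script above:

from contextlib import closing
import shutil
import urllib2

# 'image' is the remote URL and 'image_name' the local filename, as in the script above
with closing(urllib2.urlopen(image)) as response, open(image_name, 'wb') as f:
    shutil.copyfileobj(response, f)  # closing() guarantees response.close() on exit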
