UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128)

Question

我正在嘗試調試以前實習生編寫的一些代碼，但在使用其他 unicode 錯誤帖子的答案解決此問題時遇到了一些困難。

在這個函數的最後一行發現錯誤：

    def dumpTextPacket(self, header, bugLog, offset, outfile):
        """Write one text packet from bugLog to outfile, prefixed by its timestamp.

        header  -- indexable packet header; this method reads header[1]
                   (packet size in bytes), header[3] (bTag) and header[5]
                   (the value formatted as the timestamp).
        bugLog  -- seekable binary file object holding the log.
        offset  -- position in bugLog where the packet payload starts.
        outfile -- file object that receives the formatted line.
        """
        bugLog.seek(offset)
        data = bugLog.read( header[1] )        # header[1] = size of the packet
        # Undecodable bytes are silently dropped; the result is a unicode object.
        outString = data.decode("utf-8","ignore")
        if(header[3] == 8): # Packets with bTag = 8 carry extra characters that are removed here.
            outString = outString[1:]          # Drop the first character
            outString = outString.strip('\0')  # Trim NUL characters from both ends only (interior NULs stay)
        outString = "{:.3f}".format(header[5]) + ' ms: ' + outString              # Prepend the timestamp; the result is still unicode
        # NOTE(review): on Python 2 writing a unicode object to a plain file
        # forces an implicit 'ascii' encode, which is where the
        # UnicodeEncodeError is raised for non-ASCII text.
        outfile.write(outString)

我對unicode沒有太多經驗，所以我真的很感激這個問題的任何指針！

編輯：使用 Python 2.7，以下是整個文件。 我應該提到的另一件事是，該代碼在解析某些文件時確實有效，但我認為當時間戳變得太大時它會在其他文件上出錯？

在 main.py 文件中，我們調用了 LogInterpreter.execute() 方法；回溯顯示，標題中的錯誤出現在 “outfile.write(outString)” 這一行，也就是 dumpTextPacket 方法的最後一行，而該方法是在 execute 方法中被調用的：

import sys
import os
from struct import unpack
class LogInterpreter:
    """Extract the text packets of binary bug-log files into .txt files.

    A log is a sequence of packets, each made of a 12-byte header followed
    by a payload whose size is given in the header.  Written for Python 2.7.
    """

    def __init__(self):
        # Timestamp / rollover tracking used by convertTimeStamp().
        self.RTCUpdated = False
        self.RTCOffset = 0.0
        self.LastTimeStamp = 0.0
        self.TimerRolloverCount = 0
        self.ThisTimeStamp = 0.0

        # Inputs of GetRTCOffset().
        self.m_RTCSeconds = 0.0
        self.m_StartTimeInSec = 0.0

    def GetRTCOffset(self):
        """Return m_RTCSeconds minus m_StartTimeInSec."""
        return self.m_RTCSeconds - self.m_StartTimeInSec

    def convertTimeStamp(self, uTime, LogRev):
        """Convert a raw tick count into milliseconds.

        uTime  -- raw timer value taken from a packet header.
        LogRev -- log revision; rollover tracking is only done for revision 2.

        Updates ThisTimeStamp and RTCOffset, and for revision 2 also
        LastTimeStamp and TimerRolloverCount.  Returns a float.
        """
        tics_per_second = 24000000.0

        self.ThisTimeStamp = uTime
        self.RTCOffset = self.GetRTCOffset()

        if int(LogRev) == 2:
            if self.RTCUpdated:
                self.LastTimeStamp = 0.0
            # A timestamp smaller than the previous one is counted as one
            # rollover of the timer.
            if self.LastTimeStamp > self.ThisTimeStamp:
                self.TimerRolloverCount += 1
            self.LastTimeStamp = self.ThisTimeStamp

        max_u32 = 0xffffffff  # value added (in ticks) for every rollover

        seconds = ((max_u32 / tics_per_second) * self.TimerRolloverCount
                   + (uTime / tics_per_second)
                   + self.RTCOffset)
        return seconds * 1000.0

    def grabHeader(self, bugLog, offset):
        '''
        Read and decode the packet header located at offset in bugLog.

        s_PktHdrRev1
        /*0*/    u16 StartOfPacketMarker; # uShort 2
        /*2*/    u16 SizeOfPacket;        # uShort 2
        /*4*/    u08 LogRev;              # uChar  1
        /*5*/    u08 bTag;                # uChar  1
        /*6*/    u16 iSeq;                # uShort 2
        /*8*/    u32 uTime;               # uLong  4

        Returns a 6-tuple holding the first five fields unchanged followed
        by uTime converted with convertTimeStamp().  Prints a message and
        calls sys.exit() when fewer than 12 bytes are available.
        '''
        header_size = 12  # Header size in bytes
        # '=' selects standard sizes without padding while keeping native
        # byte order.  In plain native mode 'L' is 8 bytes wide and 8-byte
        # aligned on LP64 platforms, which makes the format 16 bytes long
        # and unpack() reject the 12-byte header.
        header_format = '=HHBBHL'
        bugLog.seek(offset)
        raw = bugLog.read(header_size)

        if len(raw) < header_size:
            print('Error in the format of BBLog file')
            sys.exit()

        fields = unpack(header_format, raw)
        converted_time = self.convertTimeStamp(fields[5], fields[2])
        return fields[:5] + (converted_time,)

    def dumpTextPacket(self, header, bugLog, offset, outfile):
        """Write one text packet (bTag 8 or 16) to outfile as UTF-8.

        header  -- tuple as returned by grabHeader(); header[1] is the
                   payload size, header[3] the bTag, header[5] the time.
        bugLog  -- seekable binary file object holding the log.
        offset  -- position in bugLog where the payload starts.
        outfile -- file object accepting byte strings.
        """
        bugLog.seek(offset)
        payload = bugLog.read(header[1])
        text = payload.decode("utf-8", "ignore")
        if header[3] == 8:
            # Packets with bTag = 8: drop the first character, then trim
            # NUL characters from both ends.
            text = text[1:].strip('\0')
        line = "{:.3f}".format(header[5]) + ' ms: ' + text
        # `line` is unicode.  Encode it explicitly: letting Python 2 do the
        # conversion uses the 'ascii' codec and raises UnicodeEncodeError
        # for any character outside ASCII.
        outfile.write(line.encode('utf-8'))

    def execute(self):
        """Convert every *.bin log found in ./Logs/ into a .txt file.

        The output name is the part of the input name before its first
        dot, with ".txt" appended, in the same directory.
        """
        path = './Logs/'
        for fileName in os.listdir(path):
            print(fileName)
            if not fileName.endswith(".bin"):
                continue
            print("Parsing " + fileName)
            out_path = path + fileName.split('.')[0] + ".txt"
            # The with-block closes the output even if parsing raises.
            with open(out_path, "w") as outfile:
                fileSize = os.path.getsize(path + fileName)
                packetOffset = 0
                with open(path + fileName, 'rb') as bugLog:
                    while packetOffset < fileSize:
                        currHeader = self.grabHeader(bugLog, packetOffset)
                        packetOffset += 12  # skip the header itself
                        if currHeader[3] in (8, 16):  # text packet
                            self.dumpTextPacket(currHeader, bugLog,
                                                packetOffset, outfile)
                        packetOffset += currHeader[1]  # skip the payload
            print(fileName + " completed.")

Answer 1

當您將兩個字符串加在一起並且其中一個是 Unicode 時，Python 2 也會將結果強制轉換為 Unicode。

>>> 'a' + u'b'
u'ab'

由於您使用了 data.decode，outString 將是 Unicode 字符串。

當你寫入一個二進制文件時，你必須有一個字節串。 Python 2 將嘗試將您的 Unicode 字符串轉換為字節字符串，但它使用最通用的編解碼器： 'ascii' 。 此編解碼器在許多 Unicode 字符上失敗，特別是那些代碼點高於 '\x7f' 的字符。 您可以使用功能更強大的編解碼器自行對其進行編碼以解決此問題：

outfile.write(outString.encode('utf-8'))

Python 3 中的一切都發生了變化，它不會讓您混合字節字符串和 Unicode 字符串，也不會嘗試任何自動轉換。

UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128)

問題描述

1 個解決方案

解決方案1
0 2016-08-09 19:00:57

UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128)

問題描述

1 個解決方案

解決方案1 0 2016-08-09 19:00:57

UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128)

解決方案1
0 2016-08-09 19:00:57