UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128)

Question

I'm trying to debug some code a previous intern wrote and I'm having some difficulties resolving this issue with answers from other unicode error posts.

The error is found in the last line of this function:

    def dumpTextPacket(self, header, bugLog, offset, outfile):
        """Write one packet's text payload to ``outfile`` with a timestamp prefix.

        header  -- indexable packet header; this function reads header[1]
                   (payload size), header[3] (bTag) and header[5] (timestamp,
                   formatted with three decimals and labelled "ms").
        bugLog  -- seekable input file the payload is read from.
        offset  -- position in ``bugLog`` where the payload starts.
        outfile -- output file object; only its write() method is used.
        """
        bugLog.seek(offset)
        data = bugLog.read( header[1] )        # header[1] = size of the packet
        outString = data.decode("utf-8","ignore")  # In Python 2 this yields a unicode object; undecodable bytes are dropped
        if(header[3] == 8): # Removing ugly characters from packet that has bTag = 8.
            outString = outString[1:]          # Drop the first decoded character
            outString = outString.strip('\0')  # strip() removes leading/trailing NUL characters only
        outString = "{:.3f}".format(header[5]) + ' ms: ' + outString              # Prepend the timestamp; str + unicode gives unicode in Python 2
        # NOTE: this is the line that raises the UnicodeEncodeError.  When
        # outfile is a byte-mode Python 2 file, write() implicitly encodes the
        # unicode string with the 'ascii' codec, which fails on the first
        # non-ASCII character (e.g. u'\u0446').
        outfile.write(outString)

I don't have much experience with Unicode, so I would really appreciate any pointers with this issue!

edit: Using Python 2.7, and below is the entire file. Another thing I should mention is that the code does work when parsing some files, but I think it errors on other files when the timestamp gets too big?

In the main.py file, we call the method LogInterpreter.execute(), and the traceback gives the error shown in the title on the line "outfile.write(outString)", the last line in the dumpTextPacket method which is called in the execute method:

import sys
import os
from struct import unpack
class LogInterpreter:
    """Extract the text packets of binary ``.bin`` logs into ``.txt`` files.

    A log is a sequence of packets.  Each packet is a 12-byte header (see
    ``grabHeader``) followed by ``SizeOfPacket`` payload bytes.  Payloads
    whose bTag is 8 or 16 are decoded as text and written to the output file,
    prefixed with the packet timestamp in milliseconds.
    """

    # Header layout, see grabHeader().  '=' selects standard sizes without
    # padding in native byte order, so the format is always 12 bytes long.
    # The bare 'HHBBHL' format uses the native size of C long and native
    # alignment, which makes it 16 bytes on platforms with an 8-byte long and
    # causes unpack() to reject the 12 bytes read from the file.
    HEADER_FORMAT = '=HHBBHL'
    HEADER_SIZE = 12  # Header size in bytes

    def __init__( self ):
        """Initialise the timestamp-conversion state."""
        self.RTCUpdated = False
        self.RTCOffset = 0.0
        self.LastTimeStamp = 0.0
        self.TimerRolloverCount = 0
        self.ThisTimeStamp = 0.0

        self.m_RTCSeconds = 0.0
        self.m_StartTimeInSec = 0.0

    def GetRTCOffset( self ):
        """Return ``m_RTCSeconds - m_StartTimeInSec``."""
        return self.m_RTCSeconds - self.m_StartTimeInSec

    def convertTimeStamp(self,uTime,LogRev):
        """Convert a raw tick count from a packet header to milliseconds.

        uTime  -- raw 32-bit timer value taken from the header.
        LogRev -- log revision from the header; rollover tracking is only
                  performed when it equals 2.

        Updates ThisTimeStamp and RTCOffset on every call and, for revision 2,
        LastTimeStamp and TimerRolloverCount.  Returns a float.
        """
        TicsPerSecond = 24000000.0

        self.ThisTimeStamp = uTime
        self.RTCOffset = self.GetRTCOffset()

        if int( LogRev ) == 2:
            if self.RTCUpdated:
                self.LastTimeStamp = 0.0
            # A timestamp lower than the previous one is counted as a wrap of
            # the 32-bit timer.
            if self.LastTimeStamp > self.ThisTimeStamp:
                self.TimerRolloverCount += 1
            self.LastTimeStamp = self.ThisTimeStamp

        ULnumber = (-1 & 0xffffffff)  # 0xffffffff, largest unsigned 32-bit value

        return ((ULnumber/TicsPerSecond)*self.TimerRolloverCount + (uTime/TicsPerSecond) + self.RTCOffset) * 1000.0

    def grabHeader(self, bugLog, offset):
        """Read and decode the packet header located at ``offset``.

        Header layout (s_PktHdrRev1), 12 bytes:

            /*0*/    u16 StartOfPacketMarker; # uShort 2
            /*2*/    u16 SizeOfPacket;        # uShort 2
            /*4*/    u08 LogRev;              # uChar  1
            /*5*/    u08 bTag;                # uChar  1
            /*6*/    u16 iSeq;                # uShort 2
            /*8*/    u32 uTime;               # uLong  4

        Returns a 6-tuple in that field order, except that the last element is
        the timestamp already converted by ``convertTimeStamp``.  If fewer
        than 12 bytes can be read, prints an error and calls ``sys.exit()``.
        """
        bugLog.seek(offset)
        data = bugLog.read(self.HEADER_SIZE)

        if len(data) < self.HEADER_SIZE:
            print('Error in the format of BBLog file')
            sys.exit()

        headerArray = unpack(self.HEADER_FORMAT, data)
        convertedTime = self.convertTimeStamp(headerArray[5],headerArray[2])
        return headerArray[:5] + (convertedTime,)

    def dumpTextPacket(self, header, bugLog, offset, outfile):
        """Write the text payload of a bTag 8 / bTag 16 packet to ``outfile``.

        header  -- tuple returned by ``grabHeader``; header[1] is the payload
                   size, header[3] the bTag and header[5] the timestamp.
        bugLog  -- seekable binary input file.
        offset  -- position in ``bugLog`` where the payload starts.
        outfile -- output file object; only write() is used.
        """
        bugLog.seek(offset)
        data = bugLog.read(header[1])
        outString = data.decode("utf-8", "ignore")  # undecodable bytes are dropped
        if header[3] == 8:
            # bTag 8 payloads: drop the first character, then the NUL
            # characters at either end (strip() leaves embedded ones alone).
            outString = outString[1:]
            outString = outString.strip('\0')
        outString = "{:.3f}".format(header[5]) + ' ms: ' + outString  # Prepend the timestamp
        # In Python 2 the decoded text is a unicode object, and writing it to
        # a byte-mode file makes write() encode it implicitly with the
        # 'ascii' codec, raising UnicodeEncodeError on any non-ASCII
        # character.  Encode explicitly instead.  Where str is already the
        # text type, the string is written unchanged.
        if not isinstance(outString, str):
            outString = outString.encode("utf-8")
        outfile.write(outString)

    def execute(self):
        """Convert every ``*.bin`` file in ``./Logs/`` to a ``.txt`` file.

        The output file is created next to the input, with the ``.bin``
        extension replaced by ``.txt``.
        """
        path = './Logs/'
        for fn in os.listdir(path):
            fileName = fn
            print(fn)
            if not fileName.endswith(".bin"):
                continue
            print("Parsing "+fileName)
            inPath = path + fileName
            # splitext() removes only the final extension, so "run.1.bin" and
            # "run.2.bin" no longer both map to "run.txt".
            outPath = path + os.path.splitext(fileName)[0] + ".txt"
            fileSize = os.path.getsize(inPath)
            packetOffset = 0
            # Both files are closed even if parsing raises part-way through.
            with open(inPath, 'rb') as bugLog, open(outPath, "w") as outfile:
                while packetOffset < fileSize:
                    currHeader = self.grabHeader(bugLog, packetOffset)  # Header of the current packet
                    packetOffset = packetOffset + self.HEADER_SIZE      # Skip past the header
                    if currHeader[3] in (8, 16):                        # bTag marks a text packet
                        self.dumpTextPacket(currHeader, bugLog, packetOffset, outfile)
                    packetOffset = packetOffset + currHeader[1]         # Skip the payload to reach the next packet
            print(fileName+" completed.")

Answer 1

When you add together two strings with one of them being Unicode, Python 2 will coerce the result to Unicode too.

>>> 'a' + u'b'
u'ab'

Since you used data.decode, outString will be Unicode.

When you write to a binary file, you must have a byte string. Python 2 will attempt to convert your Unicode string to a byte string, but it uses the most generic codec it has: 'ascii'. This codec fails on many Unicode characters, specifically those with a code point above '\x7f'. You can encode it yourself with a more capable codec to get around this problem:

outfile.write(outString.encode('utf-8'))

Everything changes in Python 3, which won't let you mix byte strings and Unicode strings nor attempt any automatic conversions.

UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128)

Question

1 answers

solution1
0 2016-08-09 19:00:57

UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128)

Question

1 answers

solution1 0 2016-08-09 19:00:57

solution1
0 2016-08-09 19:00:57