I'm trying to debug some code a previous intern wrote and I'm having some difficulties resolving this issue with answers from other unicode error posts.
The error is found in the last line of this function:
def dumpTextPacket(self, header, bugLog, offset, outfile):
bugLog.seek(offset)
data = bugLog.read( header[1] ) # header[1] = size of the packet
outString = data.decode("utf-8","ignore")
if(header[3] == 8): # Removing ugly characters from packet that has bTag = 8.
outString = outString[1:]
outString = outString.strip('\0') # Remove all 'null' characters from text
outString = "{:.3f}".format(header[5]) + ' ms: ' + outString # Append the timestamp to the beginning of the line
outfile.write(outString)
I don't have much experience with unicode,so I would really appreciate any pointers with this issue!
edit: Using Python 2.7, and below is the entire file. Another thing I should mention is that the code does work when parsing some files, but I think it errors on other files when the timestamp gets too big?
In the main.py file, we call the method LogInterpreter.execute(), and the traceback gives the error shown in the title on the line "outfile.write(outString)", the last line in the dumpTextPacket method which is called in the execute method:
import sys
import os
from struct import unpack
class LogInterpreter:
def __init__( self ):
self.RTCUpdated = False
self.RTCOffset = 0.0
self.LastTimeStamp = 0.0
self.TimerRolloverCount = 0
self.ThisTimeStamp = 0.0
self.m_RTCSeconds = 0.0
self.m_StartTimeInSec = 0.0
def GetRTCOffset( self ):
return self.m_RTCSeconds - self.m_StartTimeInSec
def convertTimeStamp(self,uTime,LogRev):
TicsPerSecond = 24000000.0
self.ThisTimeStamp = uTime
self.RTCOffset = self.GetRTCOffset()
if int( LogRev ) == 2:
if self.RTCUpdated:
self.LastTimeStamp = 0.0
if self.LastTimeStamp > self.ThisTimeStamp:
self.TimerRolloverCount += 1
self.LastTimeStamp = self.ThisTimeStamp
ULnumber = (-1 & 0xffffffff)
return ((ULnumber/TicsPerSecond)*self.TimerRolloverCount + (uTime/TicsPerSecond) + self.RTCOffset) * 1000.0
##########################################################################
# Information about the header for the current packet we are looking at. #
##########################################################################
def grabHeader(self, bugLog, offset):
'''
s_PktHdrRev1
/*0*/ u16 StartOfPacketMarker; # uShort 2
/*2*/ u16 SizeOfPacket; # uShort 2
/*4*/ u08 LogRev; # uChar 1
/*5*/ u08 bTag; # uChar 1
/*6*/ u16 iSeq; # uShort 2
/*8*/ u32 uTime; # uLong 4
'''
headerSize = 12 # Header size in bytes
bType = 'HHBBHL' # codes for our byte type
bugLog.seek(offset)
data = bugLog.read(headerSize)
if len(data) < headerSize:
print('Error in the format of BBLog file')
sys.exit()
headerArray = unpack(bType, data)
convertedTime = self.convertTimeStamp(headerArray[5],headerArray[2])
headerArray = headerArray[:5] + (convertedTime,)
return headerArray
################################################################
# bTag = 8 or bTag = 16 --> just write the data to LogMsgs.txt #
################################################################
def dumpTextPacket(self, header, bugLog, offset, outfile):
bugLog.seek(offset)
data = bugLog.read( header[1] ) # header[1] = size of the packet
outString = data.decode("utf-8","ignore")
if(header[3] == 8): # Removing ugly characters from packet that has bTag = 8.
outString = outString[1:]
outString = outString.strip('\0') # Remove all 'null' characters from text
outString = "{:.3f}".format(header[5]) + ' ms: ' + outString # Append the timestamp to the beginning of the line
outfile.write(outString)
def execute(self):
path = './Logs/'
for fn in os.listdir(path):
fileName = fn
print fn
if (fileName.endswith(".bin")):
# if(fileName.split('.')[1] == "bin"):
print("Parsing "+fileName)
outfile = open(path+fileName.split('.')[0]+".txt", "w") # Open a file for output
fileSize = os.path.getsize(path+fileName)
packetOffset = 0
with open(path+fileName, 'rb') as bugLog:
while(packetOffset < fileSize):
currHeader = self.grabHeader(bugLog, packetOffset) # Grab the header for the current packet
packetOffset = packetOffset + 12 # Increment the pointer by 12 bytes (size of a header packet)
if currHeader[3]==8 or currHeader[3]==16: # Look at the bTag and see if it is a text packet
self.dumpTextPacket(currHeader, bugLog, packetOffset, outfile)
packetOffset = packetOffset + currHeader[1] # Move on to the next packet by incrementing the pointer by the size of the current packet
outfile.close()
print(fileName+" completed.")
When you add together two strings with one of them being Unicode, Python 2 will coerce the result to Unicode too.
>>> 'a' + u'b'
u'ab'
Since you used data.decode
, outString
will be Unicode.
When you write to a binary file, you must have a byte string. Python 2 will attempt to convert your Unicode string to a byte string, but it uses the most generic codec it has: 'ascii'
. This codec fails on many Unicode characters, specifically those with a codepoint above '\'
. You can encode it yourself with a more capable codec to get around this problem:
outfile.write(outString.encode('utf-8'))
Everything changes in Python 3, which won't let you mix byte strings and Unicode strings nor attempt any automatic conversions.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.