我正在研究算术编码和解码算法的自适应实现,并且已经实现了python,但是对于某些字符串,我得到了正确的答案,但对于其他字符串,我得到了正确的答案。
程序首次启动时,会提供一个参数来决定符号概率的更改频率。 例如,如果参数为10,则在发送/接收10个符号之后,概率表会根据到目前为止发送/接收的所有符号进行更改。 因此,域分配也被更改。 最初,我具有1/26概率的均匀分布[az]。
它不适用于“ heloworldheloworld”和许多其他情况。
另外,我已经了解了下溢问题,但是如何解决该问题。
import sys
import random
import string
def encode(encode_str, N):
count = dict.fromkeys(string.ascii_lowercase, 1) # probability table
cdf_range = dict.fromkeys(string.ascii_lowercase, 0)
pdf = dict.fromkeys(string.ascii_lowercase, 0)
low = 0
high = float(1)/float(26)
for key, value in sorted(cdf_range.iteritems()):
cdf_range[key] = [low, high]
low = high
high += float(1)/float(26)
for key, value in sorted(pdf.iteritems()):
pdf[key] = float(1)/float(26)
# for key, value in sorted(cdf_range.iteritems()):
# print key, value
# for key, value in sorted(pdf.iteritems()):
# print key, value
i = 26
lower_bound = 0 # upper bound
upper_bound = 1 # lower bound
u = 0
# go thru every symbol in the string
for sym in encode_str:
i += 1
u += 1
count[sym] += 1
curr_range = upper_bound - lower_bound # current range
upper_bound = lower_bound + (curr_range * cdf_range[sym][1]) # upper_bound
lower_bound = lower_bound + (curr_range * cdf_range[sym][0]) # lower bound
# update cdf_range after N symbols have been read
if (u == N):
u = 0
for key, value in sorted(pdf.iteritems()):
pdf[key] = float(count[key])/float(i)
low = 0
for key, value in sorted(cdf_range.iteritems()):
high = pdf[key] + low
cdf_range[key] = [low, high]
low = high
return lower_bound
def decode(encoded, strlen, every):
decoded_str = ""
count = dict.fromkeys(string.ascii_lowercase, 1) # probability table
cdf_range = dict.fromkeys(string.ascii_lowercase, 0)
pdf = dict.fromkeys(string.ascii_lowercase, 0)
low = 0
high = float(1)/float(26)
for key, value in sorted(cdf_range.iteritems()):
cdf_range[key] = [low, high]
low = high
high += float(1)/float(26)
for key, value in sorted(pdf.iteritems()):
pdf[key] = float(1)/float(26)
lower_bound = 0 # upper bound
upper_bound = 1 # lower bound
k = 0
while (strlen != len(decoded_str)):
for key, value in sorted(pdf.iteritems()):
curr_range = upper_bound - lower_bound # current range
upper_cand = lower_bound + (curr_range * cdf_range[key][1]) # upper_bound
lower_cand = lower_bound + (curr_range * cdf_range[key][0]) # lower bound
if (lower_cand <= encoded < upper_cand):
k += 1
decoded_str += key
if (strlen == len(decoded_str)):
break
upper_bound = upper_cand
lower_bound = lower_cand
count[key] += 1
if (k == every):
k = 0
for key, value in sorted(pdf.iteritems()):
pdf[key] = float(count[key])/float(26+len(decoded_str))
low = 0
for key, value in sorted(cdf_range.iteritems()):
high = pdf[key] + low
cdf_range[key] = [low, high]
low = high
print decoded_str
def main():
count = 10
encode_str = "yyyyuuuuyyyy"
strlen = len(encode_str)
every = 3
encoded = encode(encode_str, every)
decoded = decode(encoded, strlen, every)
if __name__ == '__main__':
main()