[英]Efficiently displaying a stacked bar graph
There are n possible unique events that can occur at m different times: 在m个不同的时间可能发生n种可能的独特事件:
time event
0 A
1 A C
2 A B
3 A
4 B C
5 B C
6 A
7 B
A tally of how many times an event happened is stored into a set of n vectors of size m : 将事件发生次数的计数存储到大小为m的n个向量的集合中:
A vector: {1,2,3,4,4,4,5,5}
B vector: {0,0,1,1,2,3,3,4}
C vector: {0,1,1,1,2,3,3,3}
What I'm wondering is how I can efficiently display the vectors in the form of a stacked bar graph. 我想知道的是如何以堆叠条形图的形式有效显示向量。 I tried matplotlib (I have little Python experience) and followed this example: http://matplotlib.org/examples/pylab_examples/bar_stacked.html 我尝试过matplotlib(几乎没有python经验),并遵循以下示例: http://matplotlib.org/examples/pylab_examples/bar_stacked.html
I did get a bar graph working, but the amount of memory the program uses is too much. 我确实得到了条形图,但是该程序使用的内存量过多。 In my program I had 11 event vectors each of size ~25000. 在我的程序中,我有11个事件向量,每个向量的大小约为25000。 For some reason, the application will use over 5GB of memory. 由于某种原因,该应用程序将使用超过5GB的内存。
Could the issue be the way I wrote the script or is python simply abusing memory? 问题可能是我编写脚本的方式,还是python只是在滥用内存? I'm also open to the idea of using Mathematica or MATLAB if it can do the job better. 如果它可以做得更好,我也愿意使用Mathematica或MATLAB。
EDIT 1 编辑1
Here is some working code: 这是一些工作代码:
#!/usr/bin/env python
# a stacked bar plot of cumulative event counts over time
import numpy as np
import matplotlib.pyplot as plt
import sys, string, os
# Event types recognised in the input, in stacking order (bottom bar first).
EVENT_NAMES = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K')
# Fill colour of each event type, parallel to EVENT_NAMES.
EVENT_COLORS = ('#848484', '#FF0000', '#04B404', '#8904B1', '#FFBF00',
                '#FF0080', '#0404B4', '#E2A9F3', '#A9D0F5', '#FFFF00',
                '#58ACFA')


def tally_events(lines, event_names=EVENT_NAMES):
    """Build cumulative per-event counts indexed by time.

    Each line is expected to look like ``TIME <int> <NAME>_EVENT``; the
    second token is the time and the third the event type.  Lines with
    fewer than three tokens are skipped.  Event tokens that do not match
    any name in ``event_names`` still advance the time axis but are not
    counted.

    Args:
        lines: iterable of text lines (an open file works).
        event_names: names to track; token ``X_EVENT`` counts towards ``X``.

    Returns:
        dict mapping every name in ``event_names`` to a list whose element
        ``t`` is the number of such events seen at times ``<= t``.  All
        lists share one length: the last time seen plus one, or zero when
        no line was parsed.

    Raises:
        ValueError: if a time token is not an integer, is negative, or is
            earlier than the time on the preceding line.
    """
    token_to_name = dict((name + '_EVENT', name) for name in event_names)
    totals = dict.fromkeys(event_names, 0)
    vectors = dict((name, []) for name in event_names)
    n_steps = 0  # number of time slots recorded so far (length of each vector)

    for line in lines:
        # split() instead of split(" "): it drops the trailing newline, which
        # would otherwise stay attached to the event token and make every
        # comparison against 'X_EVENT' fail.
        tokens = line.split()
        if len(tokens) < 3:
            continue
        cur_time = int(tokens[1])
        if cur_time < max(n_steps - 1, 0):
            raise ValueError(
                'time %d is negative or out of order' % cur_time)

        # Carry the running totals across time steps that had no events.
        gap = cur_time - n_steps
        if gap > 0:
            for name in event_names:
                vectors[name].extend([totals[name]] * gap)
            n_steps = cur_time

        name = token_to_name.get(tokens[2])
        if name is not None:
            totals[name] += 1

        if cur_time == n_steps:
            # First line for this time step: open a new slot.  Deciding by
            # vector length (not by a separate "last time" variable) is what
            # makes a first event at time 0 work.
            for name in event_names:
                vectors[name].append(totals[name])
            n_steps += 1
        else:
            # Another event at the current time step: refresh its slot.
            for name in event_names:
                vectors[name][cur_time] = totals[name]
    return vectors


def plot_stacked_bars(vectors, event_names=EVENT_NAMES, colors=EVENT_COLORS):
    """Draw the cumulative counts as stacked bars, one stack per time step.

    Args:
        vectors: mapping as returned by ``tally_events``; must be non-empty
            along the time axis.
        event_names: stacking order, bottom first.
        colors: one fill colour per entry of ``event_names``.
    """
    n_steps = len(vectors[event_names[0]])
    ind = np.arange(n_steps)
    bottoms = np.zeros(n_steps)
    handles = []
    # NOTE: this draws len(event_names) * n_steps individual bars, so the
    # cost of the figure grows with the length of the time axis.
    for name, color in zip(event_names, colors):
        bars = plt.bar(ind, vectors[name], color=color, edgecolor="none",
                       width=1, bottom=bottoms)
        handles.append(bars[0])
        # New array each time so earlier layers never see later updates.
        bottoms = bottoms + vectors[name]

    # The last slot of every vector holds that event's final total.
    n_events = sum(vectors[name][-1] for name in event_names)
    print('nEvents = ' + str(n_events))

    plt.title('Events/Time Count')
    plt.xlabel('Times')
    plt.xticks(np.arange(0, n_steps, 1))  # one tick per time step
    plt.ylabel('# of Events')
    plt.yticks(np.arange(0, n_events, 1))  # one tick per counted event
    plt.legend(handles, event_names, loc='upper left')
    plt.show()


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: python ' + sys.argv[0] + ' <event-file>')
    with open(sys.argv[1]) as infile:
        event_vectors = tally_events(infile)
    if not event_vectors[EVENT_NAMES[0]]:
        sys.exit('no events found in ' + sys.argv[1])
    plot_stacked_bars(event_vectors)
Here is an example input file: 这是一个示例输入文件:
TIME 5 A_EVENT
TIME 6 B_EVENT
TIME 6 C_EVENT
TIME 7 A_EVENT
TIME 7 A_EVENT
TIME 7 D_EVENT
TIME 8 E_EVENT
TIME 8 J_EVENT
TIME 8 A_EVENT
TIME 8 A_EVENT
Here is the result: 结果如下:
The program is executed like so: python tally_events.py input.txt
该程序的执行方式如下: python tally_events.py input.txt
EDIT 2 编辑2
import numpy as np
from itertools import cycle
from collections import defaultdict
from matplotlib import pyplot as plt
import sys, string, os
def cumulative_event_counts(lines):
    """Parse ``TIME <int> <EVENT>`` lines into cumulative counts per event.

    The time axis is sized from the data (largest time seen plus one)
    instead of a fixed number of slots, so the plot only gets as many bars
    as there are time steps.  Lines with fewer than three tokens are skipped.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        ``(names, vecs)`` where ``names`` is the sorted list of event tokens
        and ``vecs`` is an integer array of shape
        ``(len(names), n_times)``; ``vecs[i, t]`` is the number of events
        ``names[i]`` at times ``<= t``.  Empty input gives ``([], 0x0 array)``.

    Raises:
        ValueError: if a time token is not an integer or is negative.
    """
    records = []
    for line in lines:
        tokens = line.split()
        if len(tokens) < 3:
            continue
        when = int(tokens[1])
        if when < 0:
            raise ValueError('negative time: %d' % when)
        records.append((tokens[2], when))

    names = sorted(set(event for event, _ in records))
    row_of = dict((name, row) for row, name in enumerate(names))
    n_times = max(when for _, when in records) + 1 if records else 0

    counts = np.zeros((len(names), n_times), dtype=int)
    for event, when in records:
        counts[row_of[event], when] += 1
    # Running total along the time axis replaces the manual accumulation loop.
    return names, np.cumsum(counts, axis=1)


if __name__ == '__main__':
    # Check for command-line argument
    if len(sys.argv) < 2:
        sys.exit('usage: python ' + sys.argv[0] + ' <event-file>')
    # Get values from input file
    with open(sys.argv[1], 'r') as infile:
        names, vecs = cumulative_event_counts(infile)
    if not names:
        sys.exit('no events found in ' + sys.argv[1])

    n_times = vecs.shape[1]
    # Overall total: the last column holds each event's final count.
    otot = int(vecs[:, -1].sum())

    # Plot it
    fig = plt.figure()
    ax = fig.add_subplot(111)
    params = {'edgecolor': 'none', 'width': 1}
    colors = cycle(['#848484', '#FF0000', '#04B404', '#8904B1', '#FFBF00',
                    '#FF0080', '#0404B4', '#E2A9F3', '#A9D0F5', '#FFFF00',
                    '#58ACFA'])
    times = np.arange(n_times)
    bottom = np.zeros(n_times)  # running top of the layers drawn so far
    for name, vec in zip(names, vecs):
        ax.bar(times, vec, bottom=bottom, facecolor=next(colors),
               label=name, **params)
        bottom = bottom + vec
    ax.set_xticks(range(n_times))
    ax.set_yticks(range(otot + 1))
    ax.legend(loc='upper left')
    plt.show()
Given the input data you posted, the plot you posted is wrong. 给定您发布的输入数据,您发布的图是错误的。 For example, 'A_EVENT' does not appear at TIME 6, so the gray box at x=6 in your plot shouldn't be there. 例如, 'A_EVENT' 不会在 TIME 6 出现,因此您绘图中 x=6 处的灰色框不应存在。
Anyway, I had to rewrite the code. 无论如何,我不得不重写代码。 As @tcaswell mentioned, it was painful to read. 正如@tcaswell提到的那样,阅读起来很痛苦。 Here is a simpler version. 这是一个简单的版本。
import numpy as np
from itertools import cycle
from collections import defaultdict
from matplotlib import pyplot as plt
# Number of time slots kept per event; every time in the input must be
# smaller than this.
N_TIMES = 10


def count_events(lines, n_times=N_TIMES):
    """Count events per time step from ``TIME <int> <EVENT>`` lines.

    Lines with fewer than three tokens are skipped.

    Args:
        lines: iterable of text lines (an open file works).
        n_times: length of every per-event count list.

    Returns:
        dict mapping each event token to a list of ``n_times`` integers;
        element ``t`` is how many times that event occurred at time ``t``.

    Raises:
        ValueError: if a time token is not an integer.
        IndexError: if a time is ``>= n_times``.
    """
    tallies = defaultdict(lambda: [0] * n_times)
    for line in lines:
        tokens = line.split()
        if len(tokens) < 3:
            continue
        tallies[tokens[2]][int(tokens[1])] += 1
    # Hand back a plain dict so later lookups cannot create entries by accident.
    return dict(tallies)


if __name__ == '__main__':
    # Get values from 'test.txt'
    with open('test.txt', 'r') as infile:
        d = count_events(infile)
    names = sorted(d.keys())
    vecs = np.array([d[k] for k in names])

    # Plot it
    fig = plt.figure()
    ax = fig.add_subplot(111)
    params = {'edgecolor': 'none', 'width': 1}
    colors = cycle(['r', 'g', 'b', 'm', 'c', 'Orange', 'Pink'])
    bottom = np.zeros(N_TIMES)  # running top of the layers drawn so far
    for name, vec in zip(names, vecs):
        ax.bar(range(N_TIMES), vec, bottom=bottom, facecolor=next(colors),
               label=name, **params)
        bottom = bottom + vec
    ax.set_xticks(range(N_TIMES))
    ax.set_yticks(range(10))  # fixed y range, independent of N_TIMES
    ax.legend(loc='upper left')
    plt.show()
which yields the dictionary d
产生字典d
[('A_EVENT', [0, 0, 0, 0, 0, 1, 0, 2, 2, 0]),
('B_EVENT', [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
('D_EVENT', [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
('J_EVENT', [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
('C_EVENT', [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
('E_EVENT', [0, 0, 0, 0, 0, 0, 0, 0, 1, 0])]
and the vectors vecs
和向量vecs
[[0 0 0 0 0 1 0 2 2 0]
[0 0 0 0 0 0 1 0 0 0]
[0 0 0 0 0 0 1 0 0 0]
[0 0 0 0 0 0 0 1 0 0]
[0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 0 0 1 0]]
and the figure 和图
I see, I did not fully grasp the fact that you are trying to make ~1M bars which is very memory intensive. 我知道,我没有完全掌握一个事实,您正在尝试制作〜1M条,这会占用大量内存。 I would suggest something like this: 我建议这样的事情:
import numpy as np
from itertools import izip, cycle
import matplotlib.pyplot as plt
from collections import defaultdict
def step_outline_x(n):
    """Return the x coordinates of a step outline over ``n`` time steps.

    The result is ``[0, 1, 1, 2, 2, ..., n-1, n-1]`` (length ``2*n - 1``):
    every interior x value appears twice so the outline can jump vertically
    at it.
    """
    return np.repeat(np.arange(n), 2)[1:]


def step_outline_y(values):
    """Return ``values`` doubled up to pair with ``step_outline_x``.

    The result is ``[v0, v0, v1, v1, ..., vN-2, vN-2, vN-1]`` (length
    ``2*N - 1``), so value ``v_i`` is held from ``x = i`` to ``x = i + 1``.
    """
    return np.repeat(values, 2)[:-1]


if __name__ == '__main__':
    # Imported here because the snippet's import list only brings in pyplot.
    from matplotlib.patches import Rectangle

    N = 100
    # Seven random non-decreasing count series labelled 'a' .. 'g'.
    fake_data = {}
    for j in range(97, 104):
        lab = chr(j)
        fake_data[lab] = np.cumsum(np.random.rand(N) > np.random.rand(1))

    colors = cycle(['r', 'g', 'b', 'm', 'c', 'Orange', 'Pink'])
    # If your matplotlib is new enough you can pass tight_layout=True here.
    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel('time')
    ax.set_ylabel('counts')

    cum_array = np.zeros(N * 2 - 1)  # to keep track of the bottoms
    x = step_outline_x(N)
    hands = []
    labs = []
    # zip stops when the finite, sorted key list runs out, so pairing it
    # with the endless colour cycle is safe.
    for k, c in zip(sorted(fake_data.keys()), colors):
        dd = step_outline_y(fake_data[k])
        # One filled region per series instead of one bar per time step.
        ax.fill_between(x, dd + cum_array, cum_array, facecolor=c, label=k,
                        edgecolor='none')
        cum_array += dd  # update the base line
        # make a legend entry
        hands.append(Rectangle([0, 0], 1, 1, color=c))  # dummy artist
        labs.append(k)  # label

    ax.set_xlim([0, N - 1])  # set the limits
    ax.legend(hands, labs, loc=2)  # add legend
    plt.show()  # make sure it shows
for N=100: 对于N = 100:
for N=100000: 对于N = 100000:
This uses roughly a few hundred megabytes of memory. 这使用了约数百兆。
As a side note, the data parsing could be further simplified to this: 作为附带说明,可以将数据解析进一步简化为:
import numpy as np
from itertools import izip
import matplotlib.pyplot as plt
from collections import defaultdict
def cumulative_presence(lines, n_times):
    """Parse a ``time event...`` table into cumulative counts per event.

    The first line is treated as a header and skipped.  Every other
    non-blank line starts with an integer time followed by the names of the
    events that occurred at that time, separated by whitespace.  An event is
    recorded as present/absent per time step, so naming it twice on one line
    counts once.

    Args:
        lines: iterable of text lines (an open file works).
        n_times: number of time steps; must be known ahead of time.

    Returns:
        dict mapping each event name to an integer array of length
        ``n_times`` whose element ``t`` is the number of time steps
        ``<= t`` at which the event occurred.

    Raises:
        ValueError: if a time token is not an integer.
        IndexError: if a time is ``>= n_times``.
    """
    # One boolean flag per time step keeps the per-event storage small.
    seen = defaultdict(lambda: np.zeros(n_times, dtype=bool))
    rows = iter(lines)
    next(rows, None)  # skip the header line (no error on empty input)
    for line in rows:
        # split() never yields empty tokens, so no per-token length check
        # is needed.
        tokens = line.split()
        if not tokens:
            continue
        when = int(tokens[0])  # the time is the first token
        for event in tokens[1:]:  # the rest are event names
            seen[event][when] = True
    return dict((event, np.cumsum(flags)) for event, flags in seen.items())


if __name__ == '__main__':
    # this requires you to know ahead of time how many time steps you have
    n_times = 10
    with open('test.txt', 'r') as infile:
        d = cumulative_presence(infile, n_times)
not strictly tested, but I think it should work. 未经严格测试,但我认为它应该可以工作。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.