![](/img/trans.png)
[英]Create a new column from two columns of a dataframe where rows of each column contains list in string format
[英]create dictionary from a list of strings, where each string contains a code and index
我有一个字符串数据集,其中每个字符串包含 19 个唯一的三字母代码之一,以及该代码出现的索引。 大多数数据是冗余的(因为数据集由 5 个几乎相同的数据集组成)。 我将所有条目整理成了一个列表。 我想创建一个字典,指示每个代码出现的所有索引。 我的思路是为每个代码创建一个索引列表,并把它作为字典中该代码对应的值。 这是我到目前为止所拥有的:
# Sample input for the snippets below. Each entry is a single string of the
# form "<three-letter code> <index>". The same run of entries is repeated
# five times; the third and fourth repetitions omit "TRP 459".
code_and_index = \
[ "ALA 1",
"THR 2",
"GLN 3",
"ALA 4",
"ARG 5",
"ASP 6",
"THR 7",
"THR 276",
"ALA 277",
"ILE 278",
"GLY 279",
"LEU 335",
"GLY 336",
"GLU 337",
"GLN 338",
"PRO 339",
"MET 340",
"ALA 341",
"HIS 342",
"ARG 343",
"PRO 344",
"PRO 345",
"ALA 346",
"THR 347",
"PHE 348",
"GLN 349",
"ALA 350",
"ASN 351",
"LYS 352",
"THR 353",
"ASP 354",
"ASP 355",
"CYS 356",
"SER 357",
"ALA 358",
"MET 359",
"GLY 360",
"ASN 361",
"HIS 362",
"CYS 363",
"SER 364",
"HIS 365",
"VAL 366",
"GLY 367",
"GLY 368",
"PRO 369",
"GLN 370",
"ASP 371",
"LEU 372",
"GLU 373",
"LYS 374",
"THR 375",
"PRO 376",
"ARG 377",
"GLY 378",
"ARG 379",
"GLY 380",
"SER 381",
"PRO 382",
"LEU 383",
"PRO 384",
"PRO 385",
"PRO 386",
"ARG 387",
"GLU 388",
"ALA 389",
"SER 390",
"LEU 391",
"ALA 392",
"TRP 459",
"HIS 460",
"SER 461",
"SER 462",
"ALA 1",
"THR 2",
"GLN 3",
"ALA 4",
"ARG 5",
"ASP 6",
"THR 7",
"THR 276",
"ALA 277",
"ILE 278",
"GLY 279",
"LEU 335",
"GLY 336",
"GLU 337",
"GLN 338",
"PRO 339",
"MET 340",
"ALA 341",
"HIS 342",
"ARG 343",
"PRO 344",
"PRO 345",
"ALA 346",
"THR 347",
"PHE 348",
"GLN 349",
"ALA 350",
"ASN 351",
"LYS 352",
"THR 353",
"ASP 354",
"ASP 355",
"CYS 356",
"SER 357",
"ALA 358",
"MET 359",
"GLY 360",
"ASN 361",
"HIS 362",
"CYS 363",
"SER 364",
"HIS 365",
"VAL 366",
"GLY 367",
"GLY 368",
"PRO 369",
"GLN 370",
"ASP 371",
"LEU 372",
"GLU 373",
"LYS 374",
"THR 375",
"PRO 376",
"ARG 377",
"GLY 378",
"ARG 379",
"GLY 380",
"SER 381",
"PRO 382",
"LEU 383",
"PRO 384",
"PRO 385",
"PRO 386",
"ARG 387",
"GLU 388",
"ALA 389",
"SER 390",
"LEU 391",
"ALA 392",
"TRP 459",
"HIS 460",
"SER 461",
"SER 462",
"ALA 1",
"THR 2",
"GLN 3",
"ALA 4",
"ARG 5",
"ASP 6",
"THR 7",
"THR 276",
"ALA 277",
"ILE 278",
"GLY 279",
"LEU 335",
"GLY 336",
"GLU 337",
"GLN 338",
"PRO 339",
"MET 340",
"ALA 341",
"HIS 342",
"ARG 343",
"PRO 344",
"PRO 345",
"ALA 346",
"THR 347",
"PHE 348",
"GLN 349",
"ALA 350",
"ASN 351",
"LYS 352",
"THR 353",
"ASP 354",
"ASP 355",
"CYS 356",
"SER 357",
"ALA 358",
"MET 359",
"GLY 360",
"ASN 361",
"HIS 362",
"CYS 363",
"SER 364",
"HIS 365",
"VAL 366",
"GLY 367",
"GLY 368",
"PRO 369",
"GLN 370",
"ASP 371",
"LEU 372",
"GLU 373",
"LYS 374",
"THR 375",
"PRO 376",
"ARG 377",
"GLY 378",
"ARG 379",
"GLY 380",
"SER 381",
"PRO 382",
"LEU 383",
"PRO 384",
"PRO 385",
"PRO 386",
"ARG 387",
"GLU 388",
"ALA 389",
"SER 390",
"LEU 391",
"ALA 392",
"HIS 460",
"SER 461",
"SER 462",
"ALA 1",
"THR 2",
"GLN 3",
"ALA 4",
"ARG 5",
"ASP 6",
"THR 7",
"THR 276",
"ALA 277",
"ILE 278",
"GLY 279",
"LEU 335",
"GLY 336",
"GLU 337",
"GLN 338",
"PRO 339",
"MET 340",
"ALA 341",
"HIS 342",
"ARG 343",
"PRO 344",
"PRO 345",
"ALA 346",
"THR 347",
"PHE 348",
"GLN 349",
"ALA 350",
"ASN 351",
"LYS 352",
"THR 353",
"ASP 354",
"ASP 355",
"CYS 356",
"SER 357",
"ALA 358",
"MET 359",
"GLY 360",
"ASN 361",
"HIS 362",
"CYS 363",
"SER 364",
"HIS 365",
"VAL 366",
"GLY 367",
"GLY 368",
"PRO 369",
"GLN 370",
"ASP 371",
"LEU 372",
"GLU 373",
"LYS 374",
"THR 375",
"PRO 376",
"ARG 377",
"GLY 378",
"ARG 379",
"GLY 380",
"SER 381",
"PRO 382",
"LEU 383",
"PRO 384",
"PRO 385",
"PRO 386",
"ARG 387",
"GLU 388",
"ALA 389",
"SER 390",
"LEU 391",
"ALA 392",
"HIS 460",
"SER 461",
"SER 462",
"ALA 1",
"THR 2",
"GLN 3",
"ALA 4",
"ARG 5",
"ASP 6",
"THR 7",
"THR 276",
"ALA 277",
"ILE 278",
"GLY 279",
"LEU 335",
"GLY 336",
"GLU 337",
"GLN 338",
"PRO 339",
"MET 340",
"ALA 341",
"HIS 342",
"ARG 343",
"PRO 344",
"PRO 345",
"ALA 346",
"THR 347",
"PHE 348",
"GLN 349",
"ALA 350",
"ASN 351",
"LYS 352",
"THR 353",
"ASP 354",
"ASP 355",
"CYS 356",
"SER 357",
"ALA 358",
"MET 359",
"GLY 360",
"ASN 361",
"HIS 362",
"CYS 363",
"SER 364",
"HIS 365",
"VAL 366",
"GLY 367",
"GLY 368",
"PRO 369",
"GLN 370",
"ASP 371",
"LEU 372",
"GLU 373",
"LYS 374",
"THR 375",
"PRO 376",
"ARG 377",
"GLY 378",
"ARG 379",
"GLY 380",
"SER 381",
"PRO 382",
"LEU 383",
"PRO 384",
"PRO 385",
"PRO 386",
"ARG 387",
"GLU 388",
"ALA 389",
"SER 390",
"LEU 391",
"ALA 392",
"TRP 459",
"HIS 460",
"SER 461",
"SER 462" ]
# Asker's attempt, with block indentation restored so it parses.
# NOTE: the logic is intentionally left as posted; the comprehension inside
# the nested loop is the behaviour the question is asking about.
code_string = "" # make string of codes then turn it into array after string is fed by loop
for i in code_and_index:
    code_indices_split = i.split()
    #print(f" the big split {acid_indices_split}")
    print()
    three_letter_code = code_indices_split[0]
    code_string += three_letter_code + " "
array_of_codes = code_string.split()
print()
# fromkeys() collapses the repeated codes into unique keys, each mapped to None.
dictionary_of_codes_and_indices = {}.fromkeys(array_of_codes)
print(dictionary_of_codes_and_indices)
for string in code_and_index :
    split_string = string.split()
    for code in dictionary_of_codes_and_indices.keys():
        if split_string[0] == code:
            # Rebuilds the whole value on every match: one copy of the current
            # index per dictionary key, replacing whatever was stored before.
            dictionary_of_codes_and_indices[ split_string[0]] = [split_string[1] for string in dictionary_of_codes_and_indices]
print(f" dictionary, without index values assigned {dictionary_of_codes_and_indices}")
print()
for x ,y in dictionary_of_codes_and_indices.items():
    print(x,y)
但是出于某种原因,当我尝试使用列表推导式来为字典赋值时,每个代码只得到同一个索引的反复出现。
ALA ['392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392', '392']
THR ['375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375', '375']
GLN ['370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370', '370']
ARG ['387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387', '387']
ASP ['371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371', '371']
ILE ['278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278', '278']
GLY ['380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380', '380']
LEU ['391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391', '391']
GLU ['388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388', '388']
PRO ['386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386', '386']
MET ['359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359', '359']
HIS ['460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460', '460']
PHE ['348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348', '348']
ASN ['361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361', '361']
LYS ['374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374', '374']
CYS ['363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363', '363']
SER ['462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462', '462']
VAL ['366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366', '366']
TRP ['459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459', '459']
我觉得我已经完成了 80% 的事情来获得我想要的东西,但由于我的经验不足,我错过了一些东西。
我会看看你写的东西,但现在,试试下面的代码,看看这是否是你想要的。
import re

# Group every index under its three-letter code (repeated indices are kept).
temp_dict = {}
for string in code_and_index:
    # Collapse runs of whitespace to a single space before splitting; strip()
    # first so leading/trailing whitespace cannot yield an extra empty field
    # and break the two-value unpack.
    code, index = re.sub(r"\s+", " ", string.strip()).split(" ")
    try:
        temp_dict[code].append(index)
    except KeyError:
        # First occurrence of this code: start its list.
        temp_dict[code] = [index]
编辑以处理重复值:
import re

# Group indices under their three-letter code, skipping indices already stored.
temp_dict = {}
for string in code_and_index:
    # Collapse runs of whitespace to a single space before splitting; strip()
    # first so leading/trailing whitespace cannot yield an extra empty field
    # and break the two-value unpack.
    code, index = re.sub(r"\s+", " ", string.strip()).split(" ")
    try:
        # The lookup raises KeyError for a code that has not been seen yet.
        if index not in temp_dict[code]:
            temp_dict[code].append(index)
    except KeyError:
        # First occurrence of this code: start its list.
        temp_dict[code] = [index]
您代码中的问题在这一行:
dictionary_of_codes_and_indices[ split_string[0]] = [split_string[1] for string in dictionary_of_codes_and_indices]
您在每次迭代时都定义了一个由相同元素 split_string[1] 组成的列表推导式,并且每次都重新赋值、覆盖之前的结果。 这就是为什么最终字典中每个代码只剩下最后一个索引。 您应该先将值初始化为空列表,而不是使用列表推导式;不过您的写法有点绕,即使只修改这一处,我运行时仍会遇到一些错误。
如果你愿意,你可以按照以下步骤进行:
# Map each three-letter code to the list of distinct indices seen for it,
# in order of first appearance.
final_dict = {}
for c in code_and_index:
    # extract code and index (split once and reuse the parts)
    parts = c.split()
    code = parts[0]
    ind = parts[1]
    # create key in dictionary the first time a code is seen
    if code not in final_dict:
        final_dict[code] = []
    # append index to the list inside the dictionary if not yet present
    if ind not in final_dict[code]:
        final_dict[code].append(ind)
输出:
{'ALA': ['1', '4', '277', '341', '346', '350', '358', '389', '392'],
'THR': ['2', '7', '276', '347', '353', '375'],
'GLN': ['3', '338', '349', '370'],
'ARG': ['5', '343', '377', '379', '387'],
'ASP': ['6', '354', '355', '371'],
'ILE': ['278'],
'GLY': ['279', '336', '360', '367', '368', '378', '380'],
'LEU': ['335', '372', '383', '391'],
'GLU': ['337', '373', '388'],
'PRO': ['339', '344', '345', '369', '376', '382', '384', '385', '386'],
'MET': ['340', '359'],
'HIS': ['342', '362', '365', '460'],
'PHE': ['348'],
'ASN': ['351', '361'],
'LYS': ['352', '374'],
'CYS': ['356', '363'],
'SER': ['357', '364', '381', '390', '461', '462'],
'VAL': ['366'],
'TRP': ['459']}
试试这个代码:
# Map each three-letter code to its distinct indices, sorted numerically.
newCleanData = {}
for item in code_and_index:
    # Split on whitespace rather than slicing at fixed columns: with a single
    # space separator, item[5:] would drop the first digit of the index.
    code, index = item.split()
    if code in newCleanData:
        tempList = newCleanData[code]
        tempList.append(index)
        # set() removes duplicates; key=int orders the index strings by
        # numeric value ('4' before '277') instead of lexicographically.
        tempList = sorted(set(tempList), key=int)
        newCleanData[code] = tempList
    else:
        newCleanData[code] = [index]
#print(newCleanData)
for code, index in newCleanData.items():
    print(code, index)
输出:
ALA ['1', '4', '277', '341', '346', '350', '358', '389', '392']
THR ['2', '7', '276', '347', '353', '375']
GLN ['3', '338', '349', '370']
ARG ['5', '343', '377', '379', '387']
ASP ['6', '354', '355', '371']
ILE ['278']
GLY ['279', '336', '360', '367', '368', '378', '380']
LEU ['335', '372', '383', '391']
GLU ['337', '373', '388']
PRO ['339', '344', '345', '369', '376', '382', '384', '385', '386']
MET ['340', '359']
HIS ['342', '362', '365', '460']
PHE ['348']
ASN ['351', '361']
LYS ['352', '374']
CYS ['356', '363']
SER ['357', '364', '381', '390', '461', '462']
VAL ['366']
TRP ['459']
解决方案是:
# Group every index under its three-letter code; repeated indices are kept.
result = {}
for token in code_and_index:
    name, value = token.split()
    # setdefault() returns the existing list for `name`, or inserts and
    # returns a new empty one, so the append works in both cases.
    result.setdefault(name, []).append(value)
for x, y in result.items():
    print(x, y)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.