繁体   English   中英

在python中使用networkx创建图形和执行链接预测时出错

[英]Error in creating graph and performing link prediction using networkx in python

我正在尝试用一个 csv 文件构建图,该文件包含各节点之间的边,以及节点的职业和年龄信息。我为每个节点分配社区,然后执行链接预测。

import networkx as nx
import csv
# One empty bucket per (profession, age bracket). The numeric suffix 1-5 is
# the age bracket used further down: 13-17, 18-29, 30-49, 50-64, 65+.
# Each name must be bound to its own list object, hence the explicit tuples.
engineers1, engineers2, engineers3, engineers4, engineers5 = [], [], [], [], []
actors1, actors2, actors3, actors4, actors5 = [], [], [], [], []
writers1, writers2, writers3, writers4, writers5 = [], [], [], [], []
doctors1, doctors2, doctors3, doctors4, doctors5 = [], [], [], [], []
drivers1, drivers2, drivers3, drivers4, drivers5 = [], [], [], [], []
teachers1, teachers2, teachers3, teachers4, teachers5 = [], [], [], [], []
nodes = []

# Undirected graph pre-populated with the integer node ids 0..4037.
g = nx.Graph()
g.add_nodes_from(range(4038))

# Each CSV row contributes one edge between its 'first' and 'second' columns.
# NOTE(review): csv.DictReader yields every field as str, while the nodes
# created earlier are ints, so these edges attach to separate string-named
# nodes. Presumably the ids should be converted with int() -- confirm against
# the actual contents of asd1.csv.
with open("asd1.csv", 'r') as csv_file:
    g.add_edges_from(
        (row['first'], row['second']) for row in csv.DictReader(csv_file)
    )
# No explicit close(): the with-statement has already closed the file.

# Sort every person listed in the CSV into the bucket for their profession
# and age bracket.
#
# Fixes compared with the previous version:
#   * the whole classification runs inside the row loop (the old indentation
#     was invalid and left most checks outside of it);
#   * ages are compared as integers, not as strings ('9' >= '65' is True for
#     strings, and '100' sorts before '13');
#   * drivers aged 50-64 go to drivers4 and doctors aged 50-64 to doctors4
#     (the two targets were swapped by a copy-paste slip).
_buckets_by_profession = {
    'actor': (actors1, actors2, actors3, actors4, actors5),
    'eng': (engineers1, engineers2, engineers3, engineers4, engineers5),
    'teacher': (teachers1, teachers2, teachers3, teachers4, teachers5),
    'driver': (drivers1, drivers2, drivers3, drivers4, drivers5),
    'doctor': (doctors1, doctors2, doctors3, doctors4, doctors5),
}
# Inclusive upper age limit of brackets 1-4; bracket 5 is "65 and above".
_bracket_upper_limits = (17, 29, 49, 64)

with open("asd1.csv", 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for line in csv_reader:
        buckets = _buckets_by_profession.get(line['profession'])
        if buckets is None:
            continue  # a profession that is not tracked
        try:
            age = int(line['age'])
        except ValueError:
            continue  # not a whole number: belongs to no bracket
        if age < 13:
            continue  # younger than the first bracket
        # Index of the bracket = how many upper limits the age exceeds
        # (13-17 -> 0, 18-29 -> 1, 30-49 -> 2, 50-64 -> 3, 65+ -> 4).
        bracket = sum(age > limit for limit in _bracket_upper_limits)
        buckets[bracket].append(line['name'])
# No explicit close(): the with-statement has already closed the file.

# Print the members of every (profession, age bracket) bucket, one line per
# bucket, followed by a blank separator after each profession.
_age_phrases = (
    "between 13 and 17",
    "between 18 and 29",
    "between 30 and 49",
    "between 50 and 64",
    "65 and above",
)
_report_rows = (
    ("actors", (actors1, actors2, actors3, actors4, actors5)),
    ("engineers", (engineers1, engineers2, engineers3, engineers4, engineers5)),
    ("teachers", (teachers1, teachers2, teachers3, teachers4, teachers5)),
    ("drivers", (drivers1, drivers2, drivers3, drivers4, drivers5)),
    ("doctors", (doctors1, doctors2, doctors3, doctors4, doctors5)),
)
for _profession, _profession_buckets in _report_rows:
    for _phrase, _members in zip(_age_phrases, _profession_buckets):
        print(_profession + " having age " + _phrase + ": ", _members)
    print('\n')

# Tag every node with a 'community' attribute and run the
# Soundarajan-Hopcroft common-neighbour link prediction.
#
# Graph.node was removed in NetworkX 2.4; Graph.nodes is the supported way
# to reach the node attribute dictionaries.

# Default community for *every* node currently in the graph (not only the
# ids 0..4037), because the prediction needs the attribute on all nodes.
# Note that the default, 0, is also the id of the first bucket (actors1).
for node in g.nodes():
    g.nodes[node]['community'] = 0

# Bucket order defines the community id: actors 0-4, engineers 5-9,
# teachers 10-14, drivers 15-19, doctors 20-24.
community_buckets = [
    actors1, actors2, actors3, actors4, actors5,
    engineers1, engineers2, engineers3, engineers4, engineers5,
    teachers1, teachers2, teachers3, teachers4, teachers5,
    drivers1, drivers2, drivers3, drivers4, drivers5,
    doctors1, doctors2, doctors3, doctors4, doctors5,
]
for community_id, members in enumerate(community_buckets):
    for member in members:
        # NOTE(review): raises KeyError when a 'name' value is not a node of
        # the graph -- confirm that names and node ids are the same thing.
        g.nodes[member]['community'] = community_id

print(g.nodes())
# The function returns a lazy iterator; materialise it so it can be printed.
l = list(nx.cn_soundarajan_hopcroft(g))
print(l)

序幕

强烈建议您阅读任何解释算法的优秀编程书籍。 你的问题可以用几行代码来解决。

第一幕

看看你的问题。你有多个职业、多个年龄组,并以姓名作为唯一标识符,而你想把它们彼此区分开来。现在看看你的代码。为了解决这个问题,你为每个年龄-职业组合都创建了一个单独的列表。这是所能创建的最难修改的结构。如果你必须再添加五个职业(而职业有成千上万种),你就得把代码量翻倍。此外,复制粘贴时很容易出错。只要把 merchandiser4 误写成 merchandiser3,就足以让你在接下来的一两个小时里陷入熬红双眼的地狱。看,你的代码中已经有这样的错误了!

# Excerpt of the question's classification code, quoted unchanged to show a
# copy-paste slip: the 50-64 branch appends to drivers4 instead of doctors4.
if (line['profession'] == 'doctor' and line['age'] >= '13' and line['age'] <= '17'):
    doctors1.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '18' and line['age'] <= '29'):
    doctors2.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '30' and line['age'] <= '49'):
    doctors3.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '50' and line['age'] <= '64'):
    # Hello, guys! I am ready to torture his brain and eyes for hours!!
    drivers4.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '65'):
    doctors5.append(line['name'])

而且,作为头脑中的最后一击,您真的不需要所有这些列表。 例如,您可以为每个职业创建一个字典。 或者是其他东西。 但是您可以注意到,您的数据对于每个人都具有非常重复的模式。 姓名、年龄、职业……等等,我们把数据带到哪里去了? CSV 文件? 什么是 CSV 文件?

是的。

表格。

第二幕

如果从表中读取数据,最好将此数据存储在表中! (嗯,大部分时间......)Python 有一个惊人的表格库 - Pandas。 您所有的数百行都可以减少到一二打! 现在仔细看看我的手,魔法开始了……

第零步。我们导入 pandas:

import pandas as pd

第一步。我们为年龄分组创建一个单独的函数。如果大老板让我们处理 11 岁的神经科学家,我们也已做好充分准备:

def get_age_cluster(age):
    """Map an age (anything int() accepts) to its age-bracket label."""
    years = int(age)
    if years < 0:
        return None  # negative ages belong to no bracket
    brackets = (
        (12, '<13'),
        (17, '13-17'),
        (29, '18-29'),
        (49, '30-49'),
        (64, '50-64'),
    )
    matches = (label for upper, label in brackets if years <= upper)
    return next(matches, '>64')

第二步。我们读取 CSV。你是在手动完成这件事:逐行处理,逐一处理每种可能的组合……为什么?!这是一个常见的操作!早就有人写好了!要学会偷懒!

(这是我老老师的忠告,我多年藏在心里!笑话。我没有心。)

# Read the CSV file into a pandas DataFrame in a single call.
df=pd.read_csv('TF.csv')

是的,这就是全部。 是的。 真的。 一条线。 二十四个符号(记住这个数字!!)。 现在让我们和我们的十个小可爱成为朋友:

==============================

我们刚刚加载了 CSV,但我们没有转换age列。 它包含年龄,但应该包含集群。 不是问题!

# Replace every raw age with the label returned by get_age_cluster.
df['age'] = df['age'].apply(get_age_cluster)

完毕! 您可以将任何转换函数应用于表中的行或列。 因此,我们不需要对年龄进行排序、对年龄进行排序、对年龄进行排序、对 aegs 进行排序以及...我们只需编写一个漂亮的单行。 结果如下:

================================

您可以注意到我们有一些垃圾列。 不是问题!

# Drop the two columns that are not needed (axis=1 selects columns).
# The two statements must be on separate lines to be valid Python.
df = df.drop('waka', axis=1)
df = df.drop('we_dont_need_this_column', axis=1)

我们有一张漂亮的小桌子:

========================

现在进入主要任务。 根据每个职业和年龄获取所有名称。 Pandas 具有许多分组功能。 让我们使用最简单的:

# Group the rows by (profession, age cluster) and print the names in each group.
grouped = df.groupby(['profession', 'age'])
for group in grouped.groups:
    print(group, list(grouped.get_group(group)['name']))

我们得到带有职业年龄组的分组结构: grouped = df.groupby(['profession', 'age']) ,对于这个结构中的每个组: for group in grouped.groups:我们打印: print()每个组中列 'name' 的列表: grouped.get_group(group)['name']) 结果如下:

('eng', '30-49') ['Cthulhu']
('driver', '18-29') ['John Doe 3']
('actor', '13-17') ['John Doe 4']
('actor', '18-29') ['Yog-Sothoth']
('teacher', '18-29') ['John Doe 2', 'Shub-Niggurath']
('eng', '>64') ['Fblthp the Lost']
('driver', '<13') ['Azathoth']
('doctor', '18-29') ['Nyarlathotep']
('doctor', '30-49') ['John Doe 1']

这是整个代码:

import pandas as pd

def get_age_cluster(age):
    """Map an age (anything int() accepts) to its age-bracket label."""
    years = int(age)
    if years < 0:
        return None  # negative ages belong to no bracket
    brackets = (
        (12, '<13'),
        (17, '13-17'),
        (29, '18-29'),
        (49, '30-49'),
        (64, '50-64'),
    )
    matches = (label for upper, label in brackets if years <= upper)
    return next(matches, '>64')

df=pd.read_csv('TF.csv')  # load the table from the CSV file
df['age'] = df['age'].apply(get_age_cluster)  # raw age -> age-cluster label
df = df.drop('waka', axis=1)  # this and the next line drop unneeded columns
df = df.drop('we_dont_need_this_column', axis=1)
grouped = df.groupby(['profession', 'age'])  # one group per (profession, age cluster)
for group in grouped.groups:  # each key is a (profession, age cluster) pair
    print(group, list(grouped.get_group(group)['name']))  # names in that group

二十四行。 我想我们现在可以称自己为神奇二十四了。 它就像神奇四侠,但神奇二十四。 但是我们的 Graph Doom 仍然存在……

第三幕

我们创建了表格,做了一些转换,对它进行了排序和过滤。 但是你还有另一个问题——图表。 而这个问题比第一个更难。

你正在从同一个文件中读取节点(人)和边(我不知道它究竟代表什么。关系?)。这给你的图施加了很强的限制——节点数等于边数。这是非常罕见的情况。我认为你在开始编写这个脚本之前就做错了什么。我建议你为节点和边使用不同的文件(或者至少使用同一文件中的不同部分)。但是!让我们假设你做的正是你想做的,每个人(当然还有克苏鲁!)都只有一条边。在这种情况下,我们只用两行代码就能构建出图:

G = nx.Graph()  # start from an empty graph
G.add_edges_from(df[['first', 'second']].values)  # one edge per table row: first -- second

搞定!我们完成了。现在让我们来处理这个奇怪而复杂的东西:

设置每个节点的社区(注意,您需要它用于算法):

# The link-prediction call that follows reads a 'community' attribute from
# the nodes; here every node is simply placed in community 0.
for n in G.nodes:
    G.nodes[n]['community'] = 0

并计算这个:

# Returns an iterator of 3-tuples; wrap it in list() to see the values.
csh = nx.cn_soundarajan_hopcroft(G)

我们得到了一个迭代器。 将其转换为列表并得到结果:

[(1, 8, 2),
 (1, 9, 0),
 (1, 2, 4),
 (1, 4, 0),
 (1, 6, 2),
 (2, 8, 2),
 (2, 9, 2),
 (2, 5, 0),
 (2, 6, 2),
 (3, 9, 0),
 (3, 4, 2),
 (3, 5, 2),
 (3, 6, 0),
 (3, 7, 4),
 (4, 8, 0),
 (4, 5, 2),
 (4, 7, 2),
 (5, 8, 0),
 (5, 9, 0),
 (5, 7, 2),
 (6, 8, 0),
 (6, 9, 2),
 (6, 7, 0),
 (7, 8, 0),
 (7, 9, 0),
 (8, 9, 0)]

终章

我希望你喜欢我为你写的这出小音乐剧 :) 我推荐你读一些好的 Python 编程书和算法书。祝你好运!

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM