[英]Iterative connected components algorithm
我有一個二分圖,我正在尋找最有效的迭代方法,將其劃分為連通分量(connected components)。 我的遞歸版本在處理大數據集時已經開始出現堆棧溢出。 我願意從任何語言或偽代碼移植,但為了完整起見,我會用 C# 來編寫。
我現有的代碼是針對我的數據類型專門編寫的。 二分圖的一個分區是蛋白質,另一個是光譜。 Map 和 Set 是仿照 C++ 標準庫(stdlib)實現的同類容器。
/// <summary>
/// Assigns a protein to a cluster and then recurses into every other protein
/// that shares one of its spectra ("cousin" proteins), so that the whole
/// reachable group ends up with the same cluster id.
/// </summary>
/// <param name="proteinId">Protein to assign.</param>
/// <param name="clusterId">Cluster id to record for the protein.</param>
/// <param name="spectrumSet">Spectra associated with <paramref name="proteinId"/>.</param>
/// <param name="spectrumSetByProteinId">Lookup from protein id to its spectra.</param>
/// <param name="proteinSetBySpectrumId">Lookup from spectrum id to its proteins.</param>
/// <param name="clusterByProteinId">Receives the protein-to-cluster assignments.</param>
void recursivelyAssignProteinToCluster (long proteinId,
                                        long clusterId,
                                        Set<long> spectrumSet,
                                        Map<long, Set<long>> spectrumSetByProteinId,
                                        Map<long, Set<long>> proteinSetBySpectrumId,
                                        Map<long, long> clusterByProteinId)
{
    // Try to record the assignment. When the insert does not take effect we
    // stop here (presumably the protein already has a cluster -- this relies
    // on Map.Insert semantics that are not shown in this file).
    if (clusterByProteinId.Insert(proteinId, clusterId).WasInserted)
    {
        foreach (long sharedSpectrumId in spectrumSet)
        {
            foreach (var cousinProteinId in proteinSetBySpectrumId[sharedSpectrumId])
            {
                // The protein appears in its own spectra's protein sets; skip it.
                if (cousinProteinId == proteinId)
                    continue;

                // Depth grows with the length of the protein chain being followed.
                recursivelyAssignProteinToCluster(cousinProteinId,
                                                  clusterId,
                                                  spectrumSetByProteinId[cousinProteinId],
                                                  spectrumSetByProteinId,
                                                  proteinSetBySpectrumId,
                                                  clusterByProteinId);
            }
        }
    }
}
/// <summary>
/// Loads (protein id, spectrum id) pairs through the given session, builds the
/// two lookups between proteins and spectra, and assigns every protein a
/// cluster id.
/// </summary>
/// <param name="session">NHibernate session used to run the query.</param>
/// <returns>Map from protein id to the cluster id assigned to it (ids start at 1).</returns>
Map<long, long> calculateProteinClusters (NHibernate.ISession session)
{
    var spectrumSetByProteinId = new Map<long, Set<long>>();
    var proteinSetBySpectrumId = new Map<long, Set<long>>();

    string queryText = "SELECT pi.Protein.id, psm.Spectrum.id " + GetFilteredQueryString(FromProtein, ProteinToPeptideSpectrumMatch);
    var rows = session.CreateQuery(queryText).List<object[]>();

    // Each row is one protein/spectrum pair; record it in both directions.
    foreach (var row in rows)
    {
        long rowProteinId = (long) row[0];
        long rowSpectrumId = (long) row[1];
        // NOTE(review): indexing a key that is not yet present is assumed to
        // create an empty Set (C++ std::map style) -- confirm against Map.
        spectrumSetByProteinId[rowProteinId].Add(rowSpectrumId);
        proteinSetBySpectrumId[rowSpectrumId].Add(rowProteinId);
    }

    var clusterByProteinId = new Map<long, long>();
    int lastClusterId = 0;
    foreach (var proteinEntry in spectrumSetByProteinId)
    {
        // Proteins reached from an earlier seed already carry a cluster id.
        if (clusterByProteinId.Contains(proteinEntry.Key))
            continue;

        // Unassigned protein: open a new cluster and flood it from here.
        ++lastClusterId;
        recursivelyAssignProteinToCluster(proteinEntry.Key,
                                          lastClusterId,
                                          proteinEntry.Value,
                                          spectrumSetByProteinId,
                                          proteinSetBySpectrumId,
                                          clusterByProteinId);
    }

    return clusterByProteinId;
}
正如 ShinTakezou 建議的那樣,我重構了該方法,把堆棧改為放在堆上,效果很好。 我採用了 digEmAll 示例中 DepthFirstSearch 方法的做法。
// Iterative clustering: an explicit Stack holds the proteins still to be
// processed, so no call-stack recursion is involved.
var clusterByProteinId = new Map<long, long>();
int clusterId = 0;
var pendingProteins = new Stack<KeyValuePair<long, Set<long>>>();
foreach (var seedEntry in spectrumSetByProteinId)
{
    long seedProteinId = seedEntry.Key;
    if (!clusterByProteinId.Contains(seedProteinId))
    {
        // Unassigned protein: start a new cluster seeded with it.
        ++clusterId;
        pendingProteins.Push(new KeyValuePair<long, Set<long>>(seedProteinId, spectrumSetByProteinId[seedProteinId]));

        while (pendingProteins.Count != 0)
        {
            var current = pendingProteins.Pop();

            // The same protein can be pushed more than once before it is
            // popped; only the pop whose insert takes effect expands it.
            if (clusterByProteinId.Insert(current.Key, clusterId).WasInserted)
            {
                // Queue every protein sharing a spectrum that has no cluster yet.
                foreach (long spectrumId in current.Value)
                {
                    foreach (var cousinProteinId in proteinSetBySpectrumId[spectrumId])
                    {
                        if (clusterByProteinId.Contains(cousinProteinId))
                            continue;
                        pendingProteins.Push(new KeyValuePair<long, Set<long>>(cousinProteinId, spectrumSetByProteinId[cousinProteinId]));
                    }
                }
            }
        }
    }
}
這是一個輔助類(helper class)的示例,它保存一個無向圖,並能以迭代方式獲取其連通分量:
/// <summary>
/// Undirected graph stored as a dictionary from each node to the set of its
/// neighbors. Traversal is iterative (explicit stack), so it does not recurse.
/// </summary>
/// <typeparam name="T">Node type; used as a dictionary key and hash-set element.</typeparam>
public class Graph<T>
{
    /// <summary>Each node mapped to the set of nodes it shares an arc with.</summary>
    public Dictionary<T, HashSet<T>> nodesNeighbors;

    /// <summary>All nodes currently in the graph.</summary>
    public IEnumerable<T> Nodes
    {
        get { return nodesNeighbors.Keys; }
    }

    /// <summary>Creates an empty graph.</summary>
    public Graph()
    {
        this.nodesNeighbors = new Dictionary<T, HashSet<T>>();
    }

    /// <summary>Adds a node with no neighbors.</summary>
    /// <exception cref="System.ArgumentException">The node is already in the graph.</exception>
    public void AddNode(T node)
    {
        this.nodesNeighbors.Add(node, new HashSet<T>());
    }

    /// <summary>Adds every node of <paramref name="nodes"/> via <see cref="AddNode"/>.</summary>
    public void AddNodes(IEnumerable<T> nodes)
    {
        foreach (var n in nodes)
            this.AddNode(n);
    }

    /// <summary>
    /// Adds an undirected arc: each endpoint is recorded as a neighbor of the other.
    /// </summary>
    /// <exception cref="KeyNotFoundException">Either endpoint is not a node of the graph.</exception>
    public void AddArc(T from, T to)
    {
        this.nodesNeighbors[from].Add(to);
        this.nodesNeighbors[to].Add(from);
    }

    /// <summary>Returns whether <paramref name="node"/> is in the graph.</summary>
    public bool ContainsNode(T node)
    {
        return this.nodesNeighbors.ContainsKey(node);
    }

    /// <summary>Returns the neighbors of <paramref name="node"/>.</summary>
    /// <exception cref="KeyNotFoundException">The node is not in the graph.</exception>
    public IEnumerable<T> GetNeighbors(T node)
    {
        return nodesNeighbors[node];
    }

    /// <summary>
    /// Lazily yields every node reachable from <paramref name="nodeStart"/>,
    /// each exactly once, using an explicit stack.
    /// </summary>
    public IEnumerable<T> DepthFirstSearch(T nodeStart)
    {
        var stack = new Stack<T>();
        var visitedNodes = new HashSet<T>();
        stack.Push(nodeStart);
        while (stack.Count > 0)
        {
            var curr = stack.Pop();

            // A node can be pushed several times before its first pop;
            // HashSet.Add returns false for the later pops.
            if (!visitedNodes.Add(curr))
                continue;

            yield return curr;
            foreach (var next in this.GetNeighbors(curr))
            {
                if (!visitedNodes.Contains(next))
                    stack.Push(next);
            }
        }
    }

    /// <summary>
    /// Builds a new graph containing <paramref name="nodes"/> and only those
    /// arcs of this graph whose both endpoints are among them.
    /// </summary>
    /// <param name="nodes">Nodes of this graph to keep; duplicates are ignored.</param>
    /// <exception cref="KeyNotFoundException">A requested node is not in this graph.</exception>
    public Graph<T> GetSubGraph(IEnumerable<T> nodes)
    {
        Graph<T> g = new Graph<T>();
        foreach (var n in nodes)
        {
            // Tolerate repeated entries instead of failing inside AddNode.
            if (!g.ContainsNode(n))
                g.AddNode(n);
        }

        foreach (var n in g.Nodes.ToList())
        {
            foreach (var neigh in this.GetNeighbors(n))
            {
                // Arcs leading outside the selected nodes are dropped; adding
                // them would index a node the subgraph does not contain.
                if (g.ContainsNode(neigh))
                    g.AddArc(n, neigh);
            }
        }
        return g;
    }

    /// <summary>
    /// Splits the graph into its connected components, one subgraph per component.
    /// </summary>
    public IEnumerable<Graph<T>> GetConnectedComponents()
    {
        var visitedNodes = new HashSet<T>();
        var components = new List<Graph<T>>();
        foreach (var node in this.Nodes)
        {
            // Nodes already placed in an earlier component are skipped.
            if (visitedNodes.Contains(node))
                continue;

            var subGraph = GetSubGraph(this.DepthFirstSearch(node));
            components.Add(subGraph);
            visitedNodes.UnionWith(subGraph.Nodes);
        }
        return components;
    }
}
用法:
// Usage example: builds a nine-node graph and splits it into connected components.
static void Main(string[] args)
{
    var graph = new Graph<long>();
    graph.AddNodes(new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9 });

    // Arcs as (from, to) rows, added in the order listed.
    long[,] arcs =
    {
        { 1, 2 },
        { 1, 3 },
        { 9, 6 },
        { 6, 7 },
        { 6, 8 },
        { 4, 5 },
    };
    for (int row = 0; row < arcs.GetLength(0); row++)
    {
        graph.AddArc(arcs[row, 0], arcs[row, 1]);
    }

    // With these arcs the components are {1,2,3}, {4,5} and {6,7,8,9}.
    var components = graph.GetConnectedComponents();
}
您可以使用 Graph<> 類來代替 Map;如果您想繼續使用 Map,也可以參考這段非常容易理解的代碼(該類內部使用 Dictionary<T, HashSet<T>> 保存節點和弧,因此與您的做法非常相似)。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.