[英]arangodb Facet calculation / aggregation slow?
我不知道為什么以下方面計算如此緩慢:
FOR q IN LRQ
COLLECT profile = q.LongRunningQuery.Profile INTO profiles
RETURN { "Profile" : profile, "Count" : LENGTH(profiles)}
盡管大約只有5,000個文檔在數據庫中,並且結果只有30個不同的方面,但這大約需要30秒。
LongRunningQuery.Profile字段使用哈希索引和跳過列表索引進行索引。 (我也嘗試過使用它們的不同組合)。
有沒有人給我提示可能出什么問題? 查詢是否可能無法從索引中受益? (5,000條記錄的大小約為1 GB,因此我假設將不使用哈希索引,而可能使用全表掃描嗎?)
有趣的是,以下替代方案僅持續2秒:
FOR q IN SKIPLIST(LRQ, { "LongRunningQuery.Profile": [ [ '>', '' ] ] })[*].LongRunningQuery.Profile
COLLECT profile = q INTO profiles
RETURN { "Profile" : profile, "Count" : LENGTH(profiles) }
但是對於如此少量的記錄,它仍然需要2秒。 在這里看起來好像使用了跳過列表索引,但它可能不是完美的索引變體。
更新2014-11-27:
arangosh [_system]> stmt._query
FOR q IN LRQ COLLECT profile = q.LongRunningQuery.Profile INTO profiles RETURN {
"Profile" : profile, "Count" : LENGTH(profiles)}
arangosh [_system]> db.LRQ.ensureHashIndex("LongRunningQuery.Profile");
{
"id" : "LRQ/296017913256",
"type" : "hash",
"unique" : false,
"fields" : [
"LongRunningQuery.Profile"
],
"isNewlyCreated" : false,
"error" : false,
"code" : 200
}
查詢耗時約32秒,返回31個簡短結果。
執行計划:
{
"plan": {
"nodes": [
{
"type": "SingletonNode",
"dependencies": [],
"id": 1,
"estimatedCost": 1,
"estimatedNrItems": 1
},
{
"type": "EnumerateCollectionNode",
"dependencies": [
1
],
"id": 2,
"estimatedCost": 5311,
"estimatedNrItems": 5310,
"database": "_system",
"collection": "LRQ",
"outVariable": {
"id": 0,
"name": "q"
}
},
{
"type": "CalculationNode",
"dependencies": [
2
],
"id": 3,
"estimatedCost": 10621,
"estimatedNrItems": 5310,
"expression": {
"type": "attribute access",
"name": "Profile",
"subNodes": [
{
"type": "attribute access",
"name": "LongRunningQuery",
"subNodes": [
{
"type": "reference",
"name": "q",
"id": 0
}
]
}
]
},
"outVariable": {
"id": 3,
"name": "3"
},
"canThrow": false
},
{
"type": "SortNode",
"dependencies": [
3
],
"id": 4,
"estimatedCost": 56166.713176593075,
"estimatedNrItems": 5310,
"elements": [
{
"inVariable": {
"id": 3,
"name": "3"
},
"ascending": true
}
],
"stable": true
},
{
"type": "AggregateNode",
"dependencies": [
4
],
"id": 5,
"estimatedCost": 61476.713176593075,
"estimatedNrItems": 5310,
"aggregates": [
{
"outVariable": {
"id": 1,
"name": "profile"
},
"inVariable": {
"id": 3,
"name": "3"
}
}
],
"outVariable": {
"id": 2,
"name": "profiles"
}
},
{
"type": "CalculationNode",
"dependencies": [
5
],
"id": 6,
"estimatedCost": 66786.71317659307,
"estimatedNrItems": 5310,
"expression": {
"type": "array",
"subNodes": [
{
"type": "array element",
"name": "Profile",
"subNodes": [
{
"type": "reference",
"name": "profile",
"id": 1
}
]
},
{
"type": "array element",
"name": "Count",
"subNodes": [
{
"type": "function call",
"name": "LENGTH",
"subNodes": [
{
"type": "list",
"subNodes": [
{
"type": "reference",
"name": "profiles",
"id": 2
}
]
}
]
}
]
}
]
},
"outVariable": {
"id": 4,
"name": "4"
},
"canThrow": false
},
{
"type": "ReturnNode",
"dependencies": [
6
],
"id": 7,
"estimatedCost": 72096.71317659307,
"estimatedNrItems": 5310,
"inVariable": {
"id": 4,
"name": "4"
}
}
],
"rules": [],
"collections": [
{
"name": "LRQ",
"type": "read"
}
],
"variables": [
{
"id": 0,
"name": "q"
},
{
"id": 1,
"name": "profile"
},
{
"id": 4,
"name": "4"
},
{
"id": 2,
"name": "profiles"
},
{
"id": 3,
"name": "3"
}
],
"estimatedCost": 72096.71317659307,
"estimatedNrItems": 5310
},
"warnings": []
}
2014年12月5日更新:
以下是其他措施:理解,謝謝。 這是輸出:
AQL_EXECUTE('FOR q IN LRQ FILTER q.LongRunningQuery.Profile == "Admin" LIMIT 1 RETURN q.LongRunningQuery.Profile', {}, { profile: true }).profile --> { "initializing" : 0, "parsing" : 0, "optimizing ast" : 15.364980936050415, "instantiating plan" : 0, "optimizing plan" : 0, "executing" : 0 }
執行 AQL_EXECUTE('FOR q IN LRQ COLLECT profile = q.LongRunningQuery.Profile INTO profiles RETURN { "Profile" : profile, "Count" : LENGTH(profiles) }', {}, { profile: true }).profile --> { "initializing" : 0, "parsing" : 0, "optimizing ast" : 0, "instantiating plan" : 0, "optimizing plan" : 0, "executing" : 77.88313102722168 }
從2.3.2版本開始,查詢 arangosh [_system]> stmt2 = db._createStatement('FOR q IN LRQ COLLECT profile = q.LongRunningQuery.Profile INTO profiles RETURN { "Profile" : profile, "Count" : LENGTH(profiles) }') 的執行計划
看起來像這樣:
arangosh [_system]> stmt2.explain()
{
"plan" : {
"nodes" : [
{
"type" : "SingletonNode",
"dependencies" : [ ],
"id" : 1,
"estimatedCost" : 1,
"estimatedNrItems" : 1
},
{
"type" : "IndexRangeNode",
"dependencies" : [
1
],
"id" : 8,
"estimatedCost" : 5311,
"estimatedNrItems" : 5310,
"database" : "_system",
"collection" : "LRQ",
"outVariable" : {
"id" : 0,
"name" : "q"
},
"ranges" : [
[ ]
],
"index" : {
"type" : "skiplist",
"id" : "530975525379",
"unique" : false,
"fields" : [
"LongRunningQuery.Profile"
]
},
"reverse" : false
},
{
"type" : "CalculationNode",
"dependencies" : [
8
],
"id" : 3,
"estimatedCost" : 10621,
"estimatedNrItems" : 5310,
"expression" : {
"type" : "attribute access",
"name" : "Profile",
"subNodes" : [
{
"type" : "attribute access",
"name" : "LongRunningQuery",
"subNodes" : [
{
"type" : "reference",
"name" : "q",
"id" : 0
}
]
}
]
},
"outVariable" : {
"id" : 3,
"name" : "3"
},
"canThrow" : false
},
{
"type" : "AggregateNode",
"dependencies" : [
3
],
"id" : 5,
"estimatedCost" : 15931,
"estimatedNrItems" : 5310,
"aggregates" : [
{
"outVariable" : {
"id" : 1,
"name" : "profile"
},
"inVariable" : {
"id" : 3,
"name" : "3"
}
}
],
"outVariable" : {
"id" : 2,
"name" : "profiles"
}
},
{
"type" : "CalculationNode",
"dependencies" : [
5
],
"id" : 6,
"estimatedCost" : 21241,
"estimatedNrItems" : 5310,
"expression" : {
"type" : "array",
"subNodes" : [
{
"type" : "array element",
"name" : "Profile",
"subNodes" : [
{
"type" : "reference",
"name" : "profile",
"id" : 1
}
]
},
{
"type" : "array element",
"name" : "Count",
"subNodes" : [
{
"type" : "function call",
"name" : "LENGTH",
"subNodes" : [
{
"type" : "list",
"subNodes" : [
{
"type" : "reference",
"name" : "profiles",
"id" : 2
}
]
}
]
}
]
}
]
},
"outVariable" : {
"id" : 4,
"name" : "4"
},
"canThrow" : false
},
{
"type" : "ReturnNode",
"dependencies" : [
6
],
"id" : 7,
"estimatedCost" : 26551,
"estimatedNrItems" : 5310,
"inVariable" : {
"id" : 4,
"name" : "4"
}
}
],
"rules" : [
"use-index-for-sort"
],
"collections" : [
{
"name" : "LRQ",
"type" : "read"
}
],
"variables" : [
{
"id" : 0,
"name" : "q"
},
{
"id" : 1,
"name" : "profile"
},
{
"id" : 4,
"name" : "4"
},
{
"id" : 2,
"name" : "profiles"
},
{
"id" : 3,
"name" : "3"
}
],
"estimatedCost" : 26551,
"estimatedNrItems" : 5310
},
"warnings" : [ ],
"stats" : {
"rulesExecuted" : 25,
"rulesSkipped" : 0,
"plansCreated" : 1
}
}
嗯,explain的輸出中有一個SortNode,而您的查詢並沒有指定排序? COLLECT可能會使優化器無法使用索引(如果使用了索引,您會看到一個IndexRangeNode而不是EnumerateCollectionNode)
如果傳遞查詢的options參數(db._query()的第4個參數){profile:true},它將輸出各階段使用的時間; 您可以重新運行您的查詢,然后向我們顯示回復嗎?
COLLECT語句需要排序的輸入。 因此,即使原始查詢字符串不包含顯式SORT語句,也會將SORT語句自動添加到執行計划中。
這就是為什么SortNode出現在計划中的原因。 如果sort屬性(在本例中為LongRunningQuery.Profile)上有一個跳過列表索引,則該排序將被優化掉。 因此,在屬性上添加一個skiplist索引可以加快速度,因為可以節省(昂貴)的排序步驟。
如果已經設置了這樣的索引並運行查詢,則它應該比僅存在哈希索引時要快。 實際上,原始查詢應該已經忽略了哈希索引。
如果已經設置了跳過列表索引並對該查詢執行explain,則還應該看到計划中不再有SortNode。
從ArangoDB 2.4(目前處於開發階段)開始,添加了僅用於計算方面的更有效的語法:
FOR q IN LRQ
COLLECT profile = q.LongRunningQuery.Profile WITH COUNT INTO numProfiles
RETURN { "Profile" : profile, "Count" : numProfiles }
這樣可以進一步加快查詢速度。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.