[英]Optimizing MongoDB Aggregation Pipeline (Group, Lookup, Match)
我是 NoSQL 数据库的新手,我选择 MongoDB 作为我的第一个 NoSQL 数据库。 我制作了一个聚合管道来显示我想要的数据,这是我的文档示例:
来自用户集合的文档样本
{
"_id": 9,
"name": "Sample Name",
"email": "email@example.com",
"password": "password hash"
}
来自 Pages Collection 的文档示例(这个并不重要)
{
"_id": 42,
"name": "Product Name",
"description": "Product Description",
"user_id": 8,
"rating_categories": [{
"_id": 114,
"name": "Build Quality"
}, {
"_id": 115,
"name": "Price"
}, {
"_id": 116,
"name": "Feature"
}, {
"_id": 117,
"name": "Comfort"
}, {
"_id": 118,
"name": "Switch"
}]
}
来自 Reviews Collection 的文档样本
{
"_id": 10,
"page_id": 42, #ID reference from pages collection
"user_id": 8, #ID reference from users collection
"review": "The review of the product",
"ratings": [{
"_id": 114, #ID Reference from pages collection of what rating category it is
"rating": 5
}, {
"_id": 115,
"rating":4
}, {
"_id": 116,
"rating": 5
}, {
"_id": 117,
"rating": 3
}, {
"_id": 118,
"rating": 4
}],
"created": "1582825968963", #Date Object
"votes": {
"downvotes": [],
"upvotes": [9] #IDs of users who upvote this review
}
}
我想通过 page_id 获得评论,可以从我制作的 API 访问它,这是聚合的预期结果:
[
{
"_id": 10, #Review of the ID
"created": "Thu, 27 Feb 2020 17:52:48 GMT",
"downvote_count": 0, #Length of votes.downvotes from reviews collection
"page_id": 42, #Page ID
"ratings": [ #Stores what rate at what rating category id
{
"_id": 114,
"rating": 5
},
{
"_id": 115,
"rating": 4
},
{
"_id": 116,
"rating": 5
},
{
"_id": 117,
"rating": 3
},
{
"_id": 118,
"rating": 4
}
],
"review": "The Review",
"upvote_count": 0, #Length of votes.upvotes from reviews collection
"user": { #User who reviewed
"_id": 8, #User ID
"downvote_count": 0, #How many downvotes this user receive from all of the user's reviews
"name": "Sample Name", #Username
"review_count": 1, #How many reviews the user made
"upvote_count": 1 #How many upvotes this user receive from all of the user's reviews
},
"vote_state": 0 #Determining vote state from the user (who requested to the API) for this review, 0 for no vote, -1 for downvote, 1 for upvote
},
...
]
这是我为上述结果制作的评论集合聚合管道:
user_id = 9
page_id = 42
pipeline = [
{"$group": {
"_id": {"user_id":"$user_id", "page_id": "$page_id"},
"review_id": {"$last": "$_id"},
"page_id": {"$last": "$page_id"},
"user_id" : {"$last": "$user_id"},
"ratings": {"$last": "$ratings"},
"review": {"$last": "$review"},
"created": {"$last": "$created"},
"votes": {"$last": "$votes"},
"upvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.upvotes", False]},
{"$size": "$votes.upvotes"},
0
]}
},
"downvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.downvotes", False]},
{"$size": "$votes.downvotes"},
0
]}
}}},
{"$lookup": {
"from": "users",
"localField": "user_id",
"foreignField": "_id",
"as": "user"
}},
{"$unwind": "$user"},
{"$lookup": {
"from": "reviews",
"localField": "user._id",
"foreignField": "user_id",
"as": "user.reviews"
}},
{"$addFields":{
"_id": "$review_id",
"user.review_count": {"$size": "$user.reviews"},
"user.upvote_count": {"$sum":{
"$map":{
"input":"$user.reviews",
"in":{"$cond": [
{"$ifNull": ["$$this.votes.upvotes", False]},
{"$size": "$$this.votes.upvotes"},
0
]}
}
}},
"user.downvote_count": {"$sum":{
"$map":{
"input":"$user.reviews",
"in":{"$cond": [
{"$ifNull": ["$$this.votes.downvotes", False]},
{"$size": "$$this.votes.downvotes"},
0
]}
}
}},
"vote_state": {"$switch": {
"branches": [
{"case": { "$and" : [
{"$ifNull": ["$votes.upvotes", False]},
{"$in": [user_id, "$votes.upvotes"]}
]}, "then": 1
},
{"case": { "$and" : [
{"$ifNull": ["$votes.downvotes", False]},
{"$in": [user_id, "$votes.downvotes"]}
]}, "then": -1
},
],
"default": 0
}},
}},
{"$project":{
"user.password": 0,
"user.email": 0,
"user_id": 0,
"review_id" : 0,
"votes": 0,
"user.reviews": 0
}},
{"$sort": {"created": -1}},
{"$match": {"page_id": page_id}},
]
注意:用户可以对同一个page_id进行多次评论,但只会显示最新的
顺便说一句,我正在使用 pymongo,这就是运营商有引号的原因
我的问题是:
是否有优化我的聚合管道的空间?
有多个小聚合执行来获取上述数据是否被认为是一种好习惯,还是有 1 个大聚合(或尽可能少)来获取我想要的数据总是更好?
如您所见,每次我想从review
集合的文档中访问votes.upvotes
或votes.downvotes
,我都会检查该字段是否为空,那是因为该字段votes.upvotes
和votes.downvotes
不是在用户进行评论时进行,而不是在用户对该评论进行投票时进行。 当用户进行评论并删除$ifNull
时,我应该在votes.upvotes
和votes.downvotes
上创建一个空字段吗? 这会提高聚合的性能吗?
谢谢
检查此聚合是否具有更好的性能。
如果您还没有,请创建这些索引:
db.reviews.create_index([("page_id", 1)])
注意:我们可以进一步提高性能,避免再次$lookup
评论。
db.reviews.aggregate([
{
$match: {
page_id: page_id
}
},
{
$addFields: {
request_user_id: user_id
}
},
{
$group: {
_id: {
page_id: "$page_id",
user_id: "$user_id",
request_user_id: "$request_user_id"
},
data: {
$push: "$$ROOT"
}
}
},
{
$lookup: {
"from": "users",
"let": {
root_user_id: "$_id.user_id"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$root_user_id",
"$_id"
]
}
}
},
{
$lookup: {
"from": "reviews",
"let": {
root_user_id: "$$root_user_id"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$root_user_id",
"$user_id"
]
}
}
},
{
$project: {
user_id: 1,
downvote_count: {
$size: "$votes.downvotes"
},
upvote_count: {
$size: "$votes.upvotes"
}
}
},
{
$group: {
_id: null,
review_count: {
$sum: {
$cond: [
{
$eq: [
"$$root_user_id",
"$user_id"
]
},
1,
0
]
}
},
upvote_count: {
$sum: "$upvote_count"
},
downvote_count: {
$sum: "$downvote_count"
}
}
},
{
$unset: "_id"
}
],
"as": "stats"
}
},
{
$project: {
tmp: {
$mergeObjects: [
{
_id: "$_id",
name: "$name"
},
{
$arrayElemAt: [
"$stats",
0
]
}
]
}
}
},
{
$replaceWith: "$tmp"
}
],
"as": "user"
}
},
{
$addFields: {
first: {
$mergeObjects: [
"$$ROOT",
{
$arrayElemAt: [
"$data",
0
]
},
{
user: {
$arrayElemAt: [
"$user",
0
]
},
created: {
$toDate: {
$toLong: {
$arrayElemAt: [
"$data.created",
0
]
}
}
},
downvote_count: {
$reduce: {
input: "$data.votes.downvotes",
initialValue: 0,
in: {
$add: [
"$$value",
{
$size: "$$this"
}
]
}
}
},
upvote_count: {
$reduce: {
input: "$data.votes.upvotes",
initialValue: 0,
in: {
$add: [
"$$value",
{
$size: "$$this"
}
]
}
}
},
vote_state: {
$cond: [
{
$gt: [
{
$size: {
$filter: {
input: "$data.votes.upvotes",
cond: {
$in: [
"$_id.request_user_id",
"$$this"
]
}
}
}
},
0
]
},
1,
{
$cond: [
{
$gt: [
{
$size: {
$filter: {
input: "$data.votes.downvotes",
cond: {
$in: [
"$_id.request_user_id",
"$$this"
]
}
}
}
},
0
]
},
-1,
0
]
}
]
}
}
]
}
}
},
{
$unset: [
"first.data",
"first.votes",
"first.user_id",
"first.request_user_id"
]
},
{
$replaceWith: "$first"
},
{
"$sort": {
"created": -1
}
}
])
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.