[英]How to reduce time complexity under c++ with nested loops and regex?
我有這樣的function。
輸入參數 - 用戶名向量、字符串向量、頂級用戶數。
首先,我計算字符串中每個用戶的出現次數。 如果一個字符串中有多次出現 - 它仍然計為 1。
然后我按出現次數對其進行排序。 如果出現次數相等 - 按字母順序對用戶名進行排序。
而 function 返回出現次數最多的前 N 個用戶。
std::vector<std::string> GetTopUsers(const std::vector<std::string>& users,
const std::vector<std::string>& lines, const int topUsersNum) {
std::vector<std::pair<std::string, int>> userOccurancies;
//count user occurancies
for (const auto & user : users) {
int count = 0;
for (const auto &line : lines) {
std::regex rgx("\\b" + user + "\\b", std::regex::icase);
std::smatch match;
if (std::regex_search(line, match, rgx)) {
++count;
auto userIter = std::find_if(userOccurancies.begin(), userOccurancies.end(),
[&user](const std::pair<std::string, int>& element) { return element.first == user; });
if (userIter == userOccurancies.end()) {
userOccurancies.push_back(std::make_pair(user, count));
}
else {
userIter->second = count;
}
}
}
}
//sort by amount of occurancies, if occurancies are equal - sort alphabetically
std::sort(userOccurancies.begin(), userOccurancies.end(),
[](const std::pair<std::string, int>& p1, const std::pair<std::string, int>& p2)
{ return (p1.second > p2.second) ? true : (p1.second == p2.second ? p1.first < p2.first : false); });
//extract top N users
int topUsersSz = (topUsersNum <= userOccurancies.size() ? topUsersNum : userOccurancies.size());
std::vector<std::string> topUsers(topUsersSz);
for (int i = 0; i < topUsersSz; i++) {
topUsers.push_back(userOccurancies[i].first);
}
return topUsers;
}
所以對於輸入
std::vector<std::string> users = { "john", "atest", "qwe" };
std::vector<std::string> lines = { "atest john", "Qwe", "qwe1", "qwe," };
int topUsersNum = 4;
output 將是qwe atest john
但它看起來非常復雜。 O(n^2) for 內部循環 + 正則表達式。 它必須是 O(n^3) 甚至更多。
您能否給我一些建議,如何在 c++11 中降低復雜性?
並給我關於代碼的建議。
或者也許有更好的板來解決有關復雜性和性能的問題?
謝謝你。
UDP
std::vector<std::string> GetTopUsers2(const std::vector<std::string>& users,
const std::vector<std::string>& lines, const size_t topUsersNum) {
std::vector<std::pair<std::string, int>> userOccurancies(users.size());
auto userOcIt = userOccurancies.begin();
for (const auto & user : users) {
userOcIt->first = std::move(user);
userOcIt->second = 0;
userOcIt++;
}
//count user occurancies
for (auto &user: userOccurancies) {
int count = 0;
std::regex rgx("\\b" + user.first + "\\b", std::regex::icase);
std::smatch match;
for (const auto &line : lines) {
if (std::regex_search(line, match, rgx)) {
++count;
user.second = count;
}
}
}
//sort by amount of occurancies, if occurancies are equal - sort alphabetically
std::sort(userOccurancies.begin(), userOccurancies.end(),
[](const std::pair<std::string, int>& p1, const std::pair<std::string, int>& p2)
{ return (p1.second > p2.second) ? true : (p1.second == p2.second ? p1.first < p2.first : false); });
//extract top N users
auto middle = userOccurancies.begin() + std::min(topUsersNum, userOccurancies.size());
int topUsersSz = (topUsersNum <= userOccurancies.size() ? topUsersNum : userOccurancies.size());
std::vector<std::string> topUsers(topUsersSz);
auto topIter = topUsers.begin();
for (auto iter = userOccurancies.begin(); iter != middle; iter++) {
*topIter = std::move(iter->first);
topIter++;
}
return topUsers;
}
感謝@Jarod42。 我更新了第一部分。 我認為在構造函數中將 memory 分配給向量一次比每次調用emplace_back
,所以我使用了它。 如果我錯了 - 標記我。
我也使用 c++11,而不是 c++17。
時間結果:
Old: 3539400.00000 nanoseconds
New: 2674000.00000 nanoseconds
它更好,但看起來仍然很復雜,不是嗎?
構造正則表達式的成本很高,並且可以移到循環之外:
你也可以移動字符串而不是復制。
您不需要對所有范圍進行排序。 std::partial_sort
就足夠了。
更重要的是,您可能會避免使用內部find_if
。
std::vector<std::string>
GetTopUsers(
std::vector<std::string> users,
const std::vector<std::string>& lines,
int topUsersNum)
{
std::vector<std::pair<std::string, std::size_t> userCount;
userCount.reserve(users.size());
for (auto& user : users) {
userCount.emplace_back(std::move(user), 0);
}
for (auto& [user, count] : userCount) {
std::regex rgx("\\b" + user + "\\b", std::regex::icase);
for (const auto &line : lines) {
std::smatch match;
if (std::regex_search(line, match, rgx)) {
++count;
}
}
}
//sort by amount of occurancies, if occurancies are equal - sort alphabetically
auto middle = userCount.begin() + std::min(topUsersNum, userCount.size());
std::partial_sort(userCount.begin(),
middle,
userCount.end(),
[](const auto& lhs, const auto& rhs)
{
return std::tie(rhs.second, lhs.first) < std::tie(lhs.second, rhs.first);
});
//extract top N users
std::vector<std::string> topUsers;
topUsers.reserve(std::distance(userCount.begin(), middle));
for (auto it = userCount.begin(); it != middle; ++it) {
topUsers.push_back(std::move(it->first));
}
return topUsers;
}
我不是專業的編碼員,但我已經讓你的代碼更快了(大約快 90%,除非我的數學錯誤或者我計時錯誤)。
它的作用是遍歷每一行,並為每一行計算給定每個用戶的出現次數。 如果當前用戶的出現次數大於前一個用戶,則將用戶移動到向量的開頭。
#include <iostream>
#include <Windows.h>
#include <vector>
#include <string>
#include <regex>
#include <algorithm>
#include <chrono>
std::vector<std::string> GetTopUsers(const std::vector<std::string>& users,
const std::vector<std::string>& lines, const int topUsersNum) {
std::vector<std::pair<std::string, int>> userOccurancies;
//count user occurancies
for (const auto & user : users) {
int count = 0;
for (const auto &line : lines) {
std::regex rgx("\\b" + user + "\\b", std::regex::icase);
std::smatch match;
if (std::regex_search(line, match, rgx)) {
++count;
auto userIter = std::find_if(userOccurancies.begin(), userOccurancies.end(),
[&user](const std::pair<std::string, int>& element) { return element.first == user; });
if (userIter == userOccurancies.end()) {
userOccurancies.push_back(std::make_pair(user, count));
}
else {
userIter->second = count;
}
}
}
}
//sort by amount of occurancies, if occurancies are equal - sort alphabetically
std::sort(userOccurancies.begin(), userOccurancies.end(),
[](const std::pair<std::string, int>& p1, const std::pair<std::string, int>& p2)
{ return (p1.second > p2.second) ? true : (p1.second == p2.second ? p1.first < p2.first : false); });
//extract top N users
int topUsersSz = (topUsersNum <= userOccurancies.size() ? topUsersNum : userOccurancies.size());
std::vector<std::string> topUsers(topUsersSz);
for (int i = 0; i < topUsersSz; i++) {
topUsers.push_back(userOccurancies[i].first);
}
return topUsers;
}
unsigned int count_user_occurences(
std::string & line,
std::string & user
)
{
unsigned int occur = {};
std::string::size_type curr_index = {};
// while we can find the name of the user in the line, and we have not reached the end of the line
while((curr_index = line.find(user, curr_index)) != std::string::npos)
{
// increase the number of occurences
++occur;
// increase string index to skip the current user
curr_index += user.length();
}
// return the number of occurences
return occur;
}
std::vector<std::string> get_top_users(
std::vector<std::string> & user_list,
std::vector<std::string> & line_list
)
{
// create vector to hold results
std::vector<std::string> top_users = {};
// put all of the users inside the "top_users" vector
top_users = user_list;
// make sure none of the vectors are empty
if(false == user_list.empty()
&& false == line_list.empty())
{
// go trough each one of the lines
for(unsigned int i = {}; i < line_list.size(); ++i)
{
// holds the number of occurences for the previous user
unsigned int last_user_occur = {};
// go trough each one of the users (we copied the list into "top_users")
for(unsigned int j = {}; j < top_users.size(); ++j)
{
// get the number of the current user in the current line
unsigned int curr_user_occur = count_user_occurences(line_list.at(i), top_users.at(j));
// user temporary name holder
std::string temp_user = {};
// if the number of occurences of the current user is larger than the one of the previous user, move it at the top
if(curr_user_occur >= last_user_occur)
{
// save the current user's name
temp_user = top_users.at(j);
// erase the user from its current position
top_users.erase(top_users.begin() + j);
// move the user at the beginning of the vector
top_users.insert(top_users.begin(), temp_user);
}
// save the occurences of the current user to compare further users
last_user_occur = curr_user_occur;
}
}
}
// return the top user vector
return top_users;
}
int main()
{
std::vector<std::string> users = { "john", "atest", "qwe" };
std::vector<std::string> lines = { "atest john", "Qwe", "qwel", "qwe," };
// time the first function
auto start = std::chrono::high_resolution_clock::now();
std::vector<std::string> top_users = get_top_users(users, lines);
auto stop = std::chrono::high_resolution_clock::now();
// save the time in milliseconds
double time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count();
// print time
printf("%.05f nanoseconds\n", time);
// time the second function
auto start2 = std::chrono::high_resolution_clock::now();
std::vector<std::string> top_users2 = GetTopUsers(users, lines, 4);
auto stop2 = std::chrono::high_resolution_clock::now();
// save the time in milliseconds
double time2 = std::chrono::duration_cast<std::chrono::nanoseconds>(stop2 - start2).count();
// print time
printf("%.05f nanoseconds", time2);
getchar();
return 0;
}
結果(至少對於我的 PC,它們在多次運行中非常一致):
366800.00000 nanoseconds
4235900.00000 nanoseconds
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.