[英]Is there any way to speed up the permutation creation for an anagram solver?
我目前正在嘗試制作一個非常快速的字謎求解器,而現在它受到排列創建的瓶頸。 是否有另一種方法來完成整個程序或優化排列創建? 這是我的代碼:
#include <string>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <unordered_set>
#include <vector>
#include <boost/asio/thread_pool.hpp>
#include <boost/asio/post.hpp>
// Appends every distinct permutation of `s` to `permutations`, in ascending
// lexicographic order. `s` is taken by value so it can be sorted and permuted
// in place without touching the caller's string.
void get_permutations(std::string s, std::vector<std::string> &permutations)
{
    // next_permutation steps through arrangements in ascending order and
    // returns false once it wraps around, so starting from the sorted string
    // visits each arrangement exactly once.
    std::sort(s.begin(), s.end());
    for (bool more = true; more; more = std::next_permutation(s.begin(), s.end()))
    {
        permutations.emplace_back(s);
    }
}
// Reads whitespace-separated words from the file `filename`, lower-cases each
// one and inserts it into `dictionary`. If the file cannot be opened the read
// loop never runs and `dictionary` is left unchanged.
void load_file(std::unordered_set<std::string> &dictionary, std::string filename)
{
    std::ifstream words(filename);
    std::string element;
    while (words >> element)
    {
        // tolower() is only defined for values representable as unsigned char
        // (or EOF). Taking the character as unsigned char keeps bytes >= 0x80
        // (e.g. from UTF-8 word lists) from being passed as negative ints.
        std::transform(element.begin(), element.end(), element.begin(),
                       [](unsigned char c) { return static_cast<char>(::tolower(c)); });
        dictionary.insert(element);
    }
}
// Writes to std::cout, one per line, every string in [start, stop) that is
// present in `dictionary`, preserving the order of the range.
//
// Matches are collected first and written with a single insertion. Callers
// may run this on several threads at once; emitting the word and its newline
// as separate inserts lets output from different threads interleave mid-line.
// A single insert makes that far less likely (the standard still does not
// promise atomic writes to std::cout).
void print_valid(const std::unordered_set<std::string>& dictionary, const std::vector<std::string>::const_iterator start, const std::vector<std::string>::const_iterator stop)
{
    std::string matches;
    for (auto iter = start; iter != stop; ++iter)
    {
        // find() rather than contains() so the function also builds as C++17.
        if (dictionary.find(*iter) != dictionary.end())
        {
            matches += *iter;
            matches += '\n';
        }
    }
    if (!matches.empty())
    {
        std::cout << matches;
    }
}
// Demo driver: loads the word list from the file "words", generates every
// permutation of a fixed test string and prints those that are dictionary
// words, followed by the elapsed time and the number of permutations.
int main()
{
    const std::string s = "asdfghjklq";
    std::vector<std::string> permutations;
    boost::asio::thread_pool pool(2);
    std::cout << "Loading english dictionary\n";
    std::unordered_set<std::string> dictionary;
    load_file(dictionary, "words");
    std::cout << "Done\n";
    //std::cout << "Enter the anagram: ";
    //getline(std::cin, s);
    // NOTE: clock() reports processor time used by the program, which can
    // differ from wall-clock time once the work is spread over pool threads.
    clock_t start = clock();
    get_permutations(s, permutations);
    //std::cout << permutations.size() << std::endl;
    std::cout << "finished permutations\n";
    if (permutations.size() > 500000)
    {
        std::cout << "making new\n";
        const std::size_t total = permutations.size();
        const std::size_t chunk = total / 3; // non-zero: total > 500000 here
        for (std::size_t pos = 0; pos < total; pos += chunk)
        {
            // Clamp the chunk end: when total is not a multiple of 3 the last
            // (short) chunk would otherwise run past permutations.end().
            const std::size_t end = std::min(pos + chunk, total);
            boost::asio::post(pool, [&dictionary, &permutations, pos, end] {
                print_valid(dictionary, permutations.begin() + pos, permutations.begin() + end);
            });
        }
        // permutations and dictionary are captured by reference, so wait for
        // all tasks before they can go out of scope.
        pool.join();
    }
    else
    {
        print_valid(dictionary, permutations.begin(), permutations.end());
    }
    clock_t finish = clock();
    double time_elapsed = (finish - start) / static_cast<double>(CLOCKS_PER_SEC);
    std::cout << time_elapsed << "\n";
    std::cout << permutations.size() << std::endl;
    return 0;
}
排列的創建是在get_permutations
線程池是為了測試非常大的排列集
想想你會如何手動處理這個問題 - 你如何檢查兩個單詞是否是彼此的字謎?
例如: banana
<-> aaannb
你會如何在一張紙上解決這個問題? 你會創建所有 720 個排列並檢查是否有任何一個匹配? 或者有沒有更簡單、更直觀的方法?
那么是什么讓一個詞成為另一個詞的變位詞,即需要滿足什么條件?
這都是關於字母計數的。 如果兩個單詞包含相同數量的所有字母,則它們是彼此的字謎。
例如:
banana
-> 3x a, 2x n, 1x b
aaannb
-> 3x a, 2x n, 1x b
有了這些知識,你能構建一個不需要迭代所有可能排列的算法嗎?
我只建議您在嘗試提出自己的優化算法后閱讀此內容
您只需要為字典單詞建立一個字母計數查找表,例如:
1x a, 1x n -> ["an"]
3x a, 1x b, 2x n -> ["banana", "nanaba"]
1x a, 1x p, 1x r, 1x t -> ["part", "trap"]
... etc ...
然后你可以將你的搜索詞分解成字母數,例如banana
-> 3x a, 1x b, 2x n
並在你的查找表中搜索分解。
結果將是您的字典中的單詞列表,您可以使用給定的字母集合構建 - 也就是給定字符串的所有可能的字謎。
假設有一個名為letter_counts
的結構,用來保存一個單詞的字母組成(每個字母出現的次數),算法可能如下所示:
std::vector<std::string> find_anagrams(std::vector<std::string> const& dictionary, std::string const& wordToCheck) {
// build a lookup map for letter composition -> word
std::unordered_map<letter_counts, std::vector<std::string>> compositionMap;
for(auto& str : dictionary)
compositionMap[letter_counts{str}].push_back(str);
// get all words that are anagrams of the given one
auto it = compositionMap.find(letter_counts{wordToCheck});
// no matches in dictionary
if(it == compositionMap.end())
return {};
// list of all anagrams
auto result = it->second;
// remove workToCheck from result if it is present
result.erase(std::remove_if(result.begin(), result.end(), [&wordToCheck](std::string const& str) { return str == wordToCheck; }), result.end());
return result;
}
這將在O(n)
時間內運行,並且空間復雜度為O(n)
,其中 n 是字典中的單詞數。
(如果您不將 compositionMap 的構造作為算法的一部分包括在內,那么這將是O(1)
時間)
相比之下,基於排列的方法具有O(n!)
時間復雜度(或者我喜歡稱之為O(scary)
),這裡的 n 是待查單詞的字母數。
這是一個僅處理字母 a-z 的完整代碼示例,但您可以輕松修改letter_counts
以使其也適用於其他字符:
#include <string_view>
#include <cctype>
#include <vector>
#include <string>
#include <unordered_map>
#include <iostream>
// Letter-frequency signature of a word: counts[i] is the number of times the
// i-th letter of a-z occurs in the string, ignoring case. Characters that do
// not lower-case to a-z are skipped. Two words are anagrams exactly when
// their signatures are equal.
struct letter_counts {
    static constexpr int num_letters = 26;
    int counts[num_letters];

    explicit letter_counts(std::string_view str) : counts{0} {
        for (const char raw : str) {
            // std::tolower is undefined for negative arguments other than EOF,
            // so go through unsigned char; keep the int result instead of
            // narrowing it back into a char.
            const int c = std::tolower(static_cast<unsigned char>(raw));
            if (c >= 'a' && c <= 'z')
                counts[c - 'a']++;
        }
    }
};
// Two letter_counts are equal when every per-letter count matches.
bool operator==(letter_counts const& lhs, letter_counts const& rhs) {
    int idx = 0;
    // Advance while the counts agree; stopping early means a mismatch.
    while (idx < letter_counts::num_letters && lhs.counts[idx] == rhs.counts[idx]) {
        ++idx;
    }
    return idx == letter_counts::num_letters;
}
// Mixes std::hash<T>(v) into `seed` in place, so a sequence of values can be
// folded into one hash. The result depends on the order of the calls.
template <class T>
inline void hash_combine(std::size_t& seed, const T& v)
{
    const std::size_t valueHash = std::hash<T>{}(v);
    // Add a fixed constant and shifted copies of the current seed before
    // xor-ing the sum back into the seed.
    const std::size_t mixed = valueHash + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    seed ^= mixed;
}
namespace std {
// std::hash specialisation so letter_counts can be used as a key in the
// unordered containers: folds the per-letter counts into one value.
template<>
struct hash<letter_counts> {
    size_t operator()(const letter_counts& letterCounts) const
    {
        const std::hash<int> hashCount{};
        size_t seed = 0;
        // Fold the counts in index order; hash_combine is order-sensitive.
        for (int idx = 0; idx < letter_counts::num_letters; ++idx)
            hash_combine(seed, hashCount(letterCounts.counts[idx]));
        return seed;
    }
};
}
std::vector<std::string> find_anagrams(std::vector<std::string> const& dictionary, std::string const& wordToCheck) {
// build a lookup map for letter composition -> word
std::unordered_map<letter_counts, std::vector<std::string>> compositionMap;
for(auto& str : dictionary)
compositionMap[letter_counts{str}].push_back(str);
// get all words that are anagrams of the given one
auto it = compositionMap.find(letter_counts{wordToCheck});
// no matches in dictionary
if(it == compositionMap.end())
return {};
// list of all anagrams
auto result = it->second;
// remove workToCheck from result if it is present
result.erase(std::remove_if(result.begin(), result.end(), [&wordToCheck](std::string const& str) { return str == wordToCheck; }), result.end());
return result;
}
int main() {
std::vector<std::string> dict = {
"banana",
"nanaba",
"foobar",
"bazinga"
};
std::string word = "aaannb";
for(auto& str : find_anagrams(dict, word)) {
std::cout << str << std::endl;
}
}
您擁有的排列方法太慢了,特別是因為 n 個不同字符的字符串的排列數量以超指數方式擴展。 嘗試諸如散列和相等謂詞之類的方法,其中 hash 基於已排序的字符串,而相等謂詞僅測試 2 個字符串的排序版本是否相等。 您可以使用 boost::unordered_map 創建自定義 hash 函數並將適合字謎的單詞添加到鍵集中。
請注意,組合的數量往往會很快變得非常大。 如果您按字母順序對兩個單詞的字符進行排序,然后排序的字符串匹配,則兩個單詞是字謎。 基於這一事實,我制作了以下示例,將字典放入多重映射中,可以快速找到單詞的所有字謎。 它通過使用按字母順序排序的輸入字符串作為 map 的鍵來實現這一點。
現場演示: https://onlinegdb.com/fXUVZruwq
#include <algorithm>
#include <iostream>
#include <locale>
#include <map>
#include <vector>
#include <set>
// create a class to hold anagram information
// Dictionary that indexes words by a key made of their lower-cased letters in
// sorted order. All anagrams of a word share the same key, so they can be
// found with a single ordered-map lookup. Words are stored lower-cased.
class anagram_dictionary_t
{
public:
    // Builds the dictionary from a fixed-size array of words.
    template<std::size_t N>
    explicit anagram_dictionary_t(const std::string (&words)[N])
    {
        for (const std::string& word : words)
        {
            std::string lower{ word };
            to_lower(lower);
            m_anagrams.emplace(make_key(word), lower);
        }
    }

    // Returns the set of stored (lower-cased) words whose letters are a
    // rearrangement of `anagram`, compared case-insensitively. The set is
    // empty when there is no match.
    auto find_words(const std::string& anagram) const
    {
        // All entries sharing the query's sorted-letter key are its anagrams.
        const auto range = m_anagrams.equal_range(make_key(anagram));

        std::set<std::string> words;
        for (auto it = range.first; it != range.second; ++it)
        {
            words.insert(it->second);
        }
        return words;
    }

    // Returns true when `word` is in the dictionary and is an anagram of
    // `anagram`. Both arguments are compared case-insensitively: stored words
    // are lower-cased, so `word` is lower-cased before comparing (otherwise a
    // query such as "Banana" could never match its own dictionary entry).
    bool is_anagram(const std::string& anagram, const std::string& word) const
    {
        std::string lowered{ word };
        to_lower(lowered);

        const auto range = m_anagrams.equal_range(make_key(anagram));
        for (auto it = range.first; it != range.second; ++it)
        {
            if (it->second == lowered)
            {
                return true;
            }
        }
        return false;
    }

private:
    // Builds the lookup key for a word: its lower-cased characters in sorted
    // order. Two words are anagrams exactly when their keys are equal.
    static std::string make_key(const std::string& word)
    {
        std::string key{ word };
        to_lower(key);
        std::sort(key.begin(), key.end());
        return key;
    }

    // Lower-cases `word` in place using a default-constructed std::locale.
    static void to_lower(std::string& word)
    {
        for (char& c : word)
        {
            c = std::tolower(c, std::locale());
        }
    }

    // sorted-letter key -> lower-cased dictionary word (several per key)
    std::multimap<std::string, std::string> m_anagrams;
};
// Demo: builds a small fruit dictionary and prints the stored words that are
// anagrams of "aaannb".
int main()
{
    const std::string fruit[] = { "Apple", "Apricot", "Avocado", "Banana", "Bilberry", "Blackberry", "Blueberry" };
    anagram_dictionary_t anagram_dictionary(fruit);

    const std::string anagram{ "aaannb" };
    const auto words = anagram_dictionary.find_words(anagram);

    std::cout << "input word = " << anagram << "\n found words : ";
    for (auto it = words.begin(); it != words.end(); ++it)
    {
        std::cout << *it << "\n";
    }
    return 0;
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.