[英]C++: Sampling from discrete distribution without replacement
我想從離散分布中進行不放回抽樣(即抽到的結果不重復)。
使用 std::discrete_distribution 可以進行有放回抽樣。 並且,借助這個類,我以一種非常粗略的方式實現了不放回抽樣:
#include <iostream>
#include <random>
#include <vector>
#include <array>
// Draws `sampleSize` outcomes from a weighted discrete distribution twice:
// first with replacement, then without replacement (a drawn outcome has its
// weight set to 0 before the next draw). A star histogram is printed after
// each experiment.
int main()
{
    const int sampleSize = 8;  // number of draws per experiment
    // Relative weights of the 10 possible outcomes; the index is the outcome.
    std::vector<double> weights = {2,2,1,1,2,2,1,1,2,2};
    std::random_device rd;
    std::mt19937 generator(rd());

    // p[k] counts how many times outcome k was drawn in the current experiment.
    std::array<int, 10> p = {};

    // Prints one "<outcome>: <stars>" row per outcome from the current counts.
    const auto printHistogram = [&p]() {
        int outcome = 0;
        for (const int hits : p) {
            std::cout << outcome << ": " << std::string(hits, '*') << std::endl;
            ++outcome;
        }
    };

    /// WITH REPLACEMENT
    {
        // One distribution serves every draw because the weights never change.
        std::discrete_distribution<int> distribution(weights.begin(), weights.end());
        int drawsLeft = sampleSize;
        while (drawsLeft-- > 0) {
            const int number = distribution(generator);
            p[number] += 1;
        }
    }
    std::cout << "Discrete_distribution with replacement:" << std::endl;
    printHistogram();

    /// WITHOUT REPLACEMENT
    p.fill(0);
    for (int drawn = 0; drawn != sampleSize; ++drawn) {
        // Rebuilt on every draw so that outcomes whose weight was zeroed
        // earlier can no longer be selected.
        std::discrete_distribution<int> distribution(weights.begin(), weights.end());
        const int number = distribution(generator);
        ++p[number];
        weights[number] = 0;  // exclude this outcome from later draws
    }
    std::cout << "Discrete_distribution without replacement:" << std::endl;
    printHistogram();
    return 0;
}
您是否曾經編碼過這種無需替換的采樣? 可能以更優化的方式?
謝謝你。
干杯,
助教
這個解決方案可能會更短一些。 不幸的是,它需要在每一步都創建一個 discrete_distribution<> 對象,這在抽取大量樣本時開銷可能會高得令人望而卻步。
#include <iostream>
#include <boost/random/discrete_distribution.hpp>
#include <boost/random/mersenne_twister.hpp>
using namespace boost::random;
// Prints all indices of `w` in a weighted random order without repetition:
// after each draw the drawn index gets weight 0 and the distribution is
// rebuilt from the updated weights.
int main(int, char**) {
    std::vector<double> w = { 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 };
    discrete_distribution<> dist(w);
    const int n = 10;            // number of draws; here equal to w.size()
    boost::random::mt19937 gen;  // constructed without a seed argument
    std::vector<int> samples;
    samples.reserve(n);
    for (int i = 0; i < n; i++) {
        const int drawn = dist(gen);
        samples.push_back(drawn);
        w[drawn] = 0;  // exclude this index from all later draws
        // Rebuild only when another draw follows. After the final draw every
        // weight can be zero, and an all-zero weight vector cannot be
        // normalised into probabilities.
        if (i + 1 < n) {
            dist = discrete_distribution<>(w);
        }
    }
    for (const int sample : samples) {
        std::cout << sample << " ";
    }
    return 0;
}
改進的答案:
在此站點上仔細查找類似的問題(快速的不放回加權抽樣)后,我發現了一種非常簡單的不放回加權抽樣算法,只是在 C++ 中實現起來有點復雜。 請注意,這不是最有效的算法,但在我看來它是最容易實現的算法。
在https://doi.org/10.1016/j.ipl.2005.11.003中詳細描述了該方法。
特別是,如果樣本量遠小於基本總體,則效率不高。
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <utility>
#include <vector>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_01.hpp>
using namespace boost::random;
// Weighted sampling without replacement via random sort keys: index i gets
// the key u^(1/w[i]) with u drawn from uniform_01, and the indices are
// printed in order of decreasing key.
int main(int, char**) {
    std::vector<double> w = { 2, 2, 1, 1, 2, 2, 1, 1, 2, 10 };
    uniform_01<> dist;
    boost::random::mt19937 gen;  // constructed without a seed argument

    // One (key, index) pair per weight. The index is std::size_t in both the
    // container and the values put into it, so no narrowing conversion occurs.
    std::vector<std::pair<double, std::size_t>> valIndices;
    valIndices.reserve(w.size());
    for (std::size_t index = 0; index < w.size(); ++index) {
        const double r = dist(gen);  // exactly one random number per weight, in order
        valIndices.emplace_back(std::pow(r, 1. / w[index]), index);
    }

    // Largest key first.
    std::sort(valIndices.begin(), valIndices.end(),
              [](const auto& x, const auto& y) { return x.first > y.first; });

    // Strip the keys; what remains is the sampled order of the indices.
    std::vector<std::size_t> samples;
    samples.reserve(valIndices.size());
    for (const auto& entry : valIndices) {
        samples.push_back(entry.second);
    }
    for (const auto sample : samples) {
        std::cout << sample << " ";
    }
    return 0;
}
更簡單的回答
我只是刪除了一些 STL 函數並用簡單的 for 循環替換了它。
#include <iostream>
#include <iterator>
#include <boost/random/uniform_01.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <algorithm>
using namespace boost::random;
// Draws a weighted sample without replacement: every index i receives the
// random key u^(1/w[i]) (u drawn from uniform_01), and the indices holding
// the largest keys form the sample, printed in order of decreasing key.
int main(int, char**) {
    std::vector<double> w = { 2, 2, 1, 1, 2, 2, 1, 1, 2, 1000 };
    uniform_01<> dist;
    boost::random::mt19937 gen(342575235);  // fixed seed

    // (index, key) pairs. Indices are std::size_t from here through to
    // `samples`, so nothing is narrowed to int along the way.
    std::vector<std::pair<std::size_t, double>> valsWithIndices;
    valsWithIndices.reserve(w.size());
    for (std::size_t index = 0; index < w.size(); ++index) {
        valsWithIndices.emplace_back(index, std::pow(dist(gen), 1. / w[index]));
    }

    // Never request more samples than there are candidates.
    const std::size_t requestedSize = 8;
    const std::size_t sampleSize = std::min(requestedSize, valsWithIndices.size());

    // Only the `sampleSize` largest keys have to be put in order; the rest of
    // the range may stay unsorted.
    const auto sampleEnd =
        valsWithIndices.begin() + static_cast<std::ptrdiff_t>(sampleSize);
    std::partial_sort(valsWithIndices.begin(), sampleEnd, valsWithIndices.end(),
                      [](const auto& x, const auto& y) { return x.second > y.second; });

    std::vector<std::size_t> samples;
    samples.reserve(sampleSize);
    for (auto it = valsWithIndices.begin(); it != sampleEnd; ++it) {
        samples.push_back(it->first);
    }
    for (const auto sample : samples) {
        std::cout << sample << " ";
    }
    return 0;
}
Aleph0 的現有答案在我測試過的方案中效果最好。 我對三種方案進行了基准測試:問題中的原始解決方案、Aleph0 添加的解決方案,以及一個新的解決方案——只有當現有分布中超過 50% 的項目已被抽中時才重新創建 discrete_distribution 分布(當分布產生樣本中已有的項目時則重新抽取)。
我用樣本大小 == 總體大小進行了測試,權重等於索引(下標)。 我認為問題中的原始解決方案的運行時間為 $O(n^2)$,我的新解決方案為 $O(n \log n)$,而論文中的方案似乎為 $O(n)$(注意:下面的實現包含一次 std::sort,因此嚴格來說是 $O(n \log n)$)。
-------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------
BM_Reuse 25252721 ns 25251731 ns 26
BM_NewDistribution 17338706125 ns 17313620000 ns 1
BM_SomePaper 6789525 ns 6779400 ns 100
代碼:
#include <array>
#include <benchmark/benchmark.h>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_01.hpp>
#include <iostream>
#include <iterator>
#include <random>
#include <vector>
const int sampleSize = 20000;
using namespace boost::random;
// Benchmark: weighted sampling without replacement that reuses one
// discrete_distribution across many draws. A draw that hits an already
// sampled item (its weight is 0) is rejected and repeated; the distribution
// is rebuilt from the current weights only once more than half of the items
// it was built from have been sampled.
static void BM_ReuseDistribution(benchmark::State &state) {
  std::vector<double> weights;
  weights.resize(sampleSize);
  for (auto _ : state) {
    // Reset the weights: item i has weight i + 1.
    for (int i = 0; i < sampleSize; i++) {
      weights[i] = i + 1;
    }
    std::random_device rd;
    std::mt19937 generator(rd());
    int o[sampleSize];  // sampled items in draw order
    std::discrete_distribution<int> distribution(weights.begin(),
                                                 weights.end());
    int numAdded = 0;           // items accepted since the last rebuild
    int distSize = sampleSize;  // items with non-zero weight at the last rebuild
    int accepted = 0;           // items accepted in total
    while (accepted < sampleSize) {
      if (numAdded > distSize / 2) {
        distSize -= numAdded;
        numAdded = 0;
        distribution =
            std::discrete_distribution<int>(weights.begin(), weights.end());
      }
      const int number = distribution(generator);
      if (weights[number] == 0) {
        // The (stale) distribution produced an item that is already in the
        // sample: draw again without advancing.
        continue;
      }
      weights[number] = 0;
      o[accepted] = number;
      ++accepted;
      numAdded += 1;
    }
    // Make the result observable so the stores into `o` cannot be optimised
    // away.
    int *sink = o;
    benchmark::DoNotOptimize(sink);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(BM_ReuseDistribution);
// Benchmark: weighted sampling without replacement that builds a fresh
// discrete_distribution from the current weights before every single draw
// and sets the weight of each drawn item to 0.
static void BM_NewDistribution(benchmark::State &state) {
  std::vector<double> weights;
  weights.resize(sampleSize);
  for (auto _ : state) {
    // Reset the weights: item i has weight i + 1.
    for (int i = 0; i < sampleSize; i++) {
      weights[i] = i + 1;
    }
    std::random_device rd;
    std::mt19937 generator(rd());
    int o[sampleSize];  // sampled items in draw order
    for (int i = 0; i < sampleSize; ++i) {
      std::discrete_distribution<int> distribution(weights.begin(),
                                                   weights.end());
      const int number = distribution(generator);
      weights[number] = 0;  // exclude this item from later draws
      o[i] = number;
    }
    // Make the result observable so the stores into `o` cannot be optimised
    // away.
    int *sink = o;
    benchmark::DoNotOptimize(sink);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(BM_NewDistribution);
// Benchmark: weighted sampling without replacement via random sort keys.
// Item i gets the key u^(1/w[i]) with u drawn from uniform_01; sorting the
// items by decreasing key yields the sampled order.
static void BM_SomePaper(benchmark::State &state) {
  std::vector<double> w;
  w.resize(sampleSize);
  for (auto _ : state) {
    // Reset the weights: item i has weight i + 1.
    for (int i = 0; i < sampleSize; i++) {
      w[i] = i + 1;
    }
    uniform_01<> dist;
    boost::random::mt19937 gen;  // constructed without a seed argument
    // One uniform random number per item ...
    std::vector<double> vals;
    std::generate_n(std::back_inserter(vals), w.size(),
                    [&dist, &gen]() { return dist(gen); });
    // ... turned in place into the sort key r^(1/weight).
    std::transform(vals.begin(), vals.end(), w.begin(), vals.begin(),
                   [](double r, double weight) { return std::pow(r, 1. / weight); });
    // Attach the item index to each key. The pair type built here is exactly
    // the element type of the vector, so no implicit narrowing takes place.
    std::vector<std::pair<double, int>> valIndices;
    int index = 0;
    std::transform(
        vals.begin(), vals.end(), std::back_inserter(valIndices),
        [&index](double v) { return std::pair<double, int>(v, index++); });
    // Largest key first.
    std::sort(valIndices.begin(), valIndices.end(),
              [](const auto &x, const auto &y) { return x.first > y.first; });
    std::vector<int> samples;
    std::transform(valIndices.begin(), valIndices.end(),
                   std::back_inserter(samples),
                   [](const auto &v) { return v.second; });
    // Make the result observable so the work above cannot be optimised away.
    int *sink = samples.data();
    benchmark::DoNotOptimize(sink);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(BM_SomePaper);
BENCHMARK_MAIN();
感謝您的問題和其他人的好回答,我和您遇到了同樣的問題。 我認為不需要每次都創建新的分布對象,而是可以用 param() 更新現有分布的權重:
dist.param({ wts.begin(), wts.end() });
完整代碼如下:
//STL改進方案
#include <iostream>
#include <vector>
#include <random>
#include <iomanip>
#include <map>
#include <set>
// Demonstrates std::discrete_distribution on a population of 24 indices:
//  1. prints the population and the probabilities derived from the weights,
//  2. draws 100000 times with replacement and prints the observed frequencies,
//  3. draws `sampleSize` distinct indices without replacement by zeroing the
//     weight of each drawn index and updating the distribution via param().
int main()
{
    // Mersenne Twister engine seeded from std::random_device.
    auto gen = std::mt19937{ std::random_device{}() };

    const int populationSize = 24;
    std::vector<int> wts(populationSize);   // weight of each index
    std::vector<int> in(populationSize);    // the population values
    std::set<int> out;                      // indices drawn without replacement
    std::map<int, int> count;               // index -> hits in the frequency test
    int sampleCount = 0;                    // number of draws without replacement
    const int sampleSize = 24;              // how many distinct indices to draw
    const int sampleTimes = 100000;         // draws in the frequency test

    // Assign the weights: 48, 46, ..., 2.
    for (int i = 0; i < populationSize; i++)
    {
        wts.at(i) = 48 - 2 * i;
    }

    // Fill and print the population.
    std::cout << "總體為24個:" << std::endl;
    for (int i = 0; i < populationSize; i++)
    {
        in.at(i) = i + 1;
        std::cout << in.at(i) << " ";
    }
    std::cout << std::endl;

    // Prints the header line followed by every probability in fixed notation
    // with 5 decimals. The stream flags stay set for all later output.
    const auto printProbabilities = [](const std::vector<double>& probs)
    {
        std::cout << "總體中各數據的權重為:" << std::endl;
        std::cout << std::fixed << std::setprecision(5);
        for (const double p : probs)
        {
            std::cout << p << " ";
        }
        std::cout << std::endl << std::endl;
    };

    // Discrete distribution over the indices, built from the weights.
    using Distribution = std::discrete_distribution<std::size_t>;
    Distribution dist(wts.begin(), wts.end());
    printProbabilities(dist.probabilities());

    //========== frequency test (with replacement) ==========
    for (int j = 0; j < sampleTimes; j++)
    {
        const int index = static_cast<int>(dist(gen));
        count[index] += 1;
    }
    double sum = 0.0;  // accumulates the observed frequencies
    std::cout << "總共抽樣" << sampleTimes << "次," << "各下標的頻數及頻率為:" << std::endl;
    for (int i = 0; i < populationSize; i++)
    {
        const double frequency = count[i] / double(sampleTimes);
        std::cout << i << "共有" << count[i] << "個 頻率為:" << frequency << std::endl;
        sum += frequency;
    }
    std::cout << "總頻率為:" << sum << std::endl << std::endl;

    //========== sampling without replacement ==========
    // Draw until the set holds `sampleSize` indices. A drawn index has its
    // weight set to 0, so it cannot be drawn again once the distribution has
    // been updated.
    const std::size_t targetSize = static_cast<std::size_t>(sampleSize);
    while (out.size() < targetSize)
    {
        const int index = static_cast<int>(dist(gen));
        out.insert(index);
        sampleCount += 1;
        if (out.size() == targetSize)
        {
            // Last draw: do not zero its weight, otherwise an all-zero weight
            // vector would be handed to the distribution.
            break;
        }
        wts.at(index) = 0;
        dist.param(Distribution::param_type(wts.begin(), wts.end()));
        printProbabilities(dist.probabilities());
    }

    // Print the sampled indices (std::set iterates in ascending order).
    std::cout << "從總體中抽取的" << sampleSize << "個樣本的下標索引為:" << std::endl;
    for (const int sampled : out)
    {
        std::cout << sampled << "-";
    }
    std::cout << std::endl;
    // Print how many draws were made.
    std::cout << "抽樣次數為:" << sampleCount << std::endl;
    std::cin.get();  // keep the console window open
    return 0;
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.