在C中的数组中找到n个重复数字

Question

I have implemented the problem statement below using the hashtable library functions in C. Since I have never used standard library hashtable in C, my questions are: 我已经在C中使用哈希表库函数实现了以下问题说明。由于我从未在C中使用标准库哈希表，因此我的问题是：

Am I using the hashtable functions correctly (I believe getting output doesn't mean right usage)? 我是否正确使用哈希表函数（我相信获得输出并不意味着正确使用）？
Is there any better way to achieve the solution for the given problem statement? 有没有更好的方法来解决给定问题陈述的解决方案？

Problem statement: Find the n most frequent elements in an array. 问题陈述：在数组中找到n个最频繁出现的元素。

1 < N < 100000 [ Length of Array] 1 <N <100000 [数组长度]
-1000000 < n < 1000000 [ Array integers] -1000000 <n <1000000 [数组整数]

I have gone through some of the similar questions at SO - and in one of the answers I do see the recommended approach being to use a hashtable. 我在SO上经历了一些类似的问题-在一个答案中，我确实看到推荐的方法是使用哈希表。

#include <search.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Occurrence count that main() passes to freqElement() as `times`.
#define REPEAT 3
// Size in bytes of the char buffer that holds one value's "%d" text (the hash key).
#define BUFFERSIZE 10

// Prints the first value of arr whose stored count equals times.
void freqElement(int* arr, int len, int times);
// Counts each value of arr in the hsearch() table; returns 1, or exits the
// process if an entry cannot be inserted.
int createHT(int* arr, int len);

/*
 * Demo driver: counts the values of a fixed sample array in the global
 * hsearch() table, then reports a value that occurs exactly REPEAT times.
 *
 * Returns 0 on success, EXIT_FAILURE if the table cannot be created or filled.
 */
int main(void)
{
    int arr[] = {2, 3, 5, 6, 10, 10, 2, 5, 2};
    int len = sizeof(arr)/sizeof(int);

    /* hcreate() returns 0 when the table cannot be allocated; nothing below
     * may touch the table in that case. */
    if (hcreate(len) == 0)
    {
        fprintf(stderr, "failed to create hash table\n");
        return EXIT_FAILURE;
    }

    if (!createHT(arr, len))
    {
        printf(" error in entering data \n");
        hdestroy();
        return EXIT_FAILURE;
    }

    freqElement(arr, len, REPEAT);

    hdestroy();
    return 0;
}

/*
 * Count how many times each value of arr occurs, using the global hsearch()
 * table (which the caller must already have created with hcreate()).
 *
 * Each value is formatted as decimal text and used as the key; the running
 * count is stored directly in the pointer-sized data field.
 *
 * arr - values to count
 * len - number of elements in arr
 *
 * Returns 1 on success. Exits the process with EXIT_FAILURE if a key cannot
 * be allocated or the table refuses a new entry.
 */
int createHT(int* arr, int len)
{
    for (int i = 0; i < len; i++)
    {
        char buffer[BUFFERSIZE];
        snprintf(buffer, sizeof buffer, "%d", arr[i]);

        ENTRY probe;
        probe.key = buffer;
        probe.data = NULL;

        /* Already present: bump the count in place. intptr_t round-trips the
         * pointer-sized field without the truncation an int cast would cause. */
        ENTRY *found = hsearch(probe, FIND);
        if (found != NULL)
        {
            found->data = (void *)((intptr_t)found->data + 1);
            continue;
        }

        /* The table stores the key pointer, not a copy of the text, so the
         * key must outlive this loop iteration: give every entry its own
         * heap copy instead of pointing at the reused stack buffer. Some
         * hdestroy() implementations also call free() on each key. */
        size_t keysize = strlen(buffer) + 1;
        char *key = malloc(keysize);
        if (key == NULL)
        {
            fprintf(stderr, "entry failed\n");
            exit(EXIT_FAILURE);
        }
        memcpy(key, buffer, keysize);

        probe.key = key;
        probe.data = (void *)(intptr_t)1;
        if (hsearch(probe, ENTER) == NULL)
        {
            free(key);
            fprintf(stderr, "entry failed\n");
            exit(EXIT_FAILURE);
        }
    }
    return 1;
}

/*
 * Print the first value of arr whose stored occurrence count equals times.
 *
 * Walks arr in order, formats each element as decimal text and looks that
 * text up in the global hsearch() table. The search stops at the first match,
 * so at most one line is printed; nothing is printed if no value matches.
 *
 * arr   - values that were previously counted into the table
 * len   - number of elements in arr
 * times - occurrence count to look for
 */
void freqElement(int* arr, int len, int times)
{
    for (int i = 0; i < len; i++)
    {
        char buffer[BUFFERSIZE];
        snprintf(buffer, sizeof buffer, "%d", arr[i]);

        /* hsearch() takes the ENTRY by value, so give both members a
         * defined value before passing it. */
        ENTRY query;
        query.key = buffer;
        query.data = NULL;

        ENTRY *found = hsearch(query, FIND);

        /* The count is stored in the pointer-sized data field; intptr_t
         * converts it back without the truncation an int cast would cause. */
        if (found != NULL && (intptr_t)found->data == times)
        {
            printf(" value %s is repeated %d times \n", found->key, times);
            break;
        }
    }
}

Answer 1

I'm not sure I'd use the hcreate() , hsearch() , hdestroy() triad of functions for this task, but it can be used. 我不确定我是否会使用hcreate() 、 hsearch() 、 hdestroy()这三个函数来完成此任务，但它们是可以使用的。 The POSIX specification is not clear on some issues, such as the release of the keys by hdestroy() , but the Mac OS X manual says: POSIX规范在某些问题上尚不清楚，例如hdestroy()是否释放键，但Mac OS X手册说：

The hdestroy() function disposes of the search table, and may be followed by another call to hcreate() . hdestroy()函数销毁搜索表，之后可以再次调用hcreate() 。 After the call to hdestroy() , the data can no longer be considered accessible. 调用hdestroy()之后，数据不再被视为可访问。 The hdestroy() function calls free(3) for each comparison key in the search table but not the data item associated with the key. hdestroy()函数会对搜索表中的每个比较键调用free(3) ，但不会对与该键关联的数据项调用。

(POSIX doesn't mention hdestroy() calling free() on the comparison keys.) （POSIX并未提及hdestroy()在比较键上调用free() 。）

Here's a relatively simple adaptation of your code that works and runs cleanly under valgrind , at least with GCC 6.1.0 and Valgrind 3.12.0-SVN on Mac OS X 10.11.4. 这是您的代码的相对简单的改编，至少在Mac OS X 10.11.4上的GCC 6.1.0和Valgrind 3.12.0-SVN下，可以在valgrind下正常运行。

$ gcc -O3 -g -std=c11 -Wall -Wextra -Wmissing-prototypes \
>     -Wstrict-prototypes -Wold-style-definition -Werror hs17.c -o hs17
$

Code 码

#include <search.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Size in bytes of the char buffer that holds one value's "%d" text (the hash key).
#define BUFFERSIZE 10

// Prints every distinct value of arr whose stored count equals times.
void freqElement(int *arr, int len, int times);
// Counts each value of arr in the hsearch() table; returns 1, or exits the
// process if an entry cannot be inserted.
int createHT(int *arr, int len);

/*
 * Demo driver: counts the values of a fixed sample array in the global
 * hsearch() table, then lists the values grouped by how often they occur.
 *
 * Returns EXIT_SUCCESS when the report was produced, EXIT_FAILURE if the
 * table could not be created or filled.
 */
int main(void)
{
    int arr[] = { 2, 3, 5, 6, 10, 10, 2, 5, 2, 8, 8, 7, 8, 7, 8, 7, };
    int len = sizeof(arr) / sizeof(int);
    int status = EXIT_FAILURE;

    if (hcreate(len) == 0)
    {
        fprintf(stderr, "Failed to create hash table of size %d\n", len);
        return status;
    }

    if (!createHT(arr, len))
        fprintf(stderr, "error in entering data\n");
    else
    {
        /* A value can occur anywhere from 1 to len times (an array of
         * identical elements), so the upper bound is inclusive. */
        for (int i = 1; i <= len; i++)
            freqElement(arr, len, i);
        status = EXIT_SUCCESS;
    }

    hdestroy();
    return status;
}

/*
 * Count how many times each value of arr occurs, using the global hsearch()
 * table (which the caller must already have created with hcreate()).
 *
 * Every value is formatted as decimal text; a heap copy of that text is the
 * key and the running count is kept in the pointer-sized data field. A
 * "Processing [...]" progress line is printed for each element.
 *
 * arr - values to count
 * len - number of elements in arr
 *
 * Returns 1 on success. Exits the process with EXIT_FAILURE if a key cannot
 * be allocated or the table refuses an entry.
 */
int createHT(int *arr, int len)
{
    ENTRY e, *ep;

    for (int i = 0; i < len; i++)
    {
        char buffer[BUFFERSIZE];
        snprintf(buffer, sizeof(buffer), "%d", arr[i]);

        /* Copy the key with malloc()+memcpy() rather than strdup(): strdup()
         * is POSIX, not ISO C, and is not declared by every libc under a
         * strict -std=c11 build. The result is checked because a NULL key
         * must never reach printf("%s") or hsearch(). */
        size_t keysize = strlen(buffer) + 1;
        char *key = malloc(keysize);
        if (key == NULL)
        {
            fprintf(stderr, "out of memory for key [%s]\n", buffer);
            exit(EXIT_FAILURE);
        }
        memcpy(key, buffer, keysize);

        e.key = key;
        e.data = (void *)0;
        printf("Processing [%s]\n", e.key);

        ep = hsearch(e, ENTER);
        if (ep)
        {
            ep->data = (void *)((intptr_t)ep->data + 1);
            /* ENTER returned an existing entry, so the table kept its own
             * key and this fresh copy is not referenced anywhere. */
            if (ep->key != e.key)
                free(e.key);
        }
        else
        {
            fprintf(stderr, "entry failed for [%s]\n", e.key);
            free(e.key);
            exit(EXIT_FAILURE);
        }
    }
    return 1;
}

// Report whether `value` appears among the first `len` elements of `arr`.
// Returns true on the first match, false if none of them equals `value`
// (including when `len` is zero or negative).
static bool processed_before(int *arr, int len, int value)
{
    int idx = 0;

    while (idx < len)
    {
        if (arr[idx] == value)
            return true;
        idx++;
    }
    return false;
}

// Print every distinct value of arr whose stored count equals times.
//
// Each element is formatted as decimal text and looked up in the global
// hsearch() table. A value is reported only at its first position in arr,
// so values that occur several times are printed once.
void freqElement(int *arr, int len, int times)
{
    ENTRY probe;
    char text[BUFFERSIZE];

    for (int pos = 0; pos < len; pos++)
    {
        snprintf(text, BUFFERSIZE, "%d", arr[pos]);
        probe.key = text;

        ENTRY *hit = hsearch(probe, FIND);
        if (hit == NULL)
            continue;
        if ((intptr_t)hit->data != times)
            continue;
        // Skip values already reported at an earlier index.
        if (processed_before(arr, pos, arr[pos]))
            continue;

        printf(" value %s is repeated %d times\n", hit->key, times);
    }
}

The processed_before() function prevents values with multiple entries being printed multiple times — it's a consequence of the change to the freqElement() function that reports on all the entries with the given number of appearances, rather than just the first such entry. processed_before()函数可防止出现多次的值被重复打印——这是对freqElement()函数所做更改的结果，该函数现在报告具有给定出现次数的所有条目，而不仅仅是第一个此类条目。 It isn't entirely desirable, but it keeps the output free of duplicates. 这并不完全理想，但可以避免重复输出。 The code includes some printing so that the progress can be monitored, which helps reassure that the code is working correctly. 该代码包含一些打印内容，以便可以监视进度，这有助于确认代码正确运行。

Example output 输出示例

Processing [2]
Processing [3]
Processing [5]
Processing [6]
Processing [10]
Processing [10]
Processing [2]
Processing [5]
Processing [2]
Processing [8]
Processing [8]
Processing [7]
Processing [8]
Processing [7]
Processing [8]
Processing [7]
 value 3 is repeated 1 times 
 value 6 is repeated 1 times 
 value 5 is repeated 2 times 
 value 10 is repeated 2 times 
 value 2 is repeated 3 times 
 value 7 is repeated 3 times 
 value 8 is repeated 4 times

Answer 2

Let's start from the problem statement, since as I mentioned in a comment I don't think your current code addresses the problem: 让我们从问题陈述开始，因为正如我在评论中提到的那样，我认为您当前的代码无法解决问题：

Find the n most frequent element in an array. 在数组中找到n个最频繁的元素。

1 < N < 100000 [Length of Array] 1 <N <100000 [数组长度]

-1000000 < n < 1000000 [Array integers] -1000000 <n <1000000 [数组整数]

I take this to mean we want a function like this: 我的意思是说我们想要一个这样的函数：

size_t n_most_popular(int input[], size_t input_size, int output[], size_t output_size);

This function takes the input array (of size up to 100000) of integers (between -1000000 and 1000000), and populates the output array with the N most common elements from the input, where N is output_size . 此函数采用整数（在-1000000和1000000之间）的输入数组（大小最大为100000），并使用输入中的N个最常见的元素填充输出数组，其中N为output_size 。 For convenience we can stipulate that we will place the most common element at the front of the output, and less common elements toward the back. 为了方便起见，我们可以规定将最常见的元素放置在输出的前面，而将不常见的元素放置在输出的后面。

A straightforward approach to this would be to first sort the input array (possibly in place, possibly using standard qsort() ). 一种简单的方法是首先对输入数组进行排序（可能在适当的位置，可能使用标准qsort() ）。 Then you'll have an array like this: 然后，您将得到一个像这样的数组：

[1,1,1,1,2,2,3,3,3,3,3,4,7,...]

Then, make an array of structs where each struct contains a unique value from the input, plus the number of times it occurs. 然后，构造一个结构数组，其中每个结构都包含来自输入的唯一值及其出现的次数。 Maximum length of this is input_size , and it's trivial to build it in a single pass from the sorted input. 它的最大长度是input_size ，从排序后的输入一次通过构建它很简单。

Finally, sort this array of structs by the count field in descending order, using standard qsort() . 最后，使用标准qsort()按count字段降序对该结构数组进行排序。 Copy the first output_size elements to the output array, and return the actual populated size of the output array (which may be smaller than output_size if there are not enough unique values in the input array). 将第一个output_size元素复制到输出数组，然后返回输出数组的实际填充大小（如果输入数组中的唯一值不足，则该大小可能小于output_size ）。

Here's a working C program to do just that: 这是一个可以正常工作的C程序：

#include <string.h>
#include <stdio.h>
#include <stdlib.h>

// Sorts input in place, then writes up to output_size of its most frequent
// values into output (most frequent first) and returns how many were written.
size_t most_popular(int input[], size_t input_size, int output[], size_t output_size);

/*
 * Demo driver: asks most_popular() for the three most frequent values of a
 * fixed sample array and prints them with their rank (1 = most frequent).
 */
int main(void)
{
    int arr[] = {2, 3, 5, 6, 10, 10, 2, 5, 2};
    size_t len = sizeof(arr)/sizeof(int);

    int out[3];
    size_t outlen = sizeof(out)/sizeof(int);

    size_t count = most_popular(arr, len, out, outlen);

    for (size_t ii = 0; ii < count; ii++) {
        /* %zu is the conversion for size_t; %lu only matches where size_t
         * happens to be unsigned long. */
        printf("most popular rank %zu: %d\n", ii+1, out[ii]);
    }

    return 0;
}

/* One distinct input value together with the number of times it occurs. */
struct value_count
{
    int value;  /* the distinct value taken from the input */
    int count;  /* how many input elements equal `value` */
};
typedef struct value_count value_count;

/*
 * qsort() comparator that orders value_count records by count, highest first.
 * Returns a negative number when lhs has the larger count, a positive number
 * when rhs has the larger count, and 0 when the counts are equal.
 */
int value_count_greater(const void* lhs, const void* rhs)
{
    const value_count *first = lhs;
    const value_count *second = rhs;
    const int first_count = first->count;
    const int second_count = second->count;

    return second_count - first_count;
}

/*
 * qsort() comparator that orders ints ascending.
 * Returns -1 if *lhs < *rhs, 1 if *lhs > *rhs, and 0 if they are equal.
 */
int int_less(const void *lhs, const void *rhs)
{
    const int left = *(const int *)lhs;
    const int right = *(const int *)rhs;

    /* Compare instead of subtracting: left - right overflows (undefined
     * behaviour, wrong sign in practice) when the operands are far apart. */
    return (left > right) - (left < right);
}

// Fills output with the most frequent values of input, most frequent first.
//
// input is sorted in place as a side effect. At most output_size values are
// written. Returns the number of values written, which is 0 when input_size
// is 0 or the scratch array cannot be allocated.
size_t most_popular(int input[], size_t input_size, int output[], size_t output_size)
{
    // Sorting groups equal values into adjacent runs.
    qsort(input, input_size, sizeof input[0], int_less);

    value_count *tally = malloc(input_size * sizeof *tally);
    if (!tally) {
        return 0;
    }

    // Collapse each run of equal values into one (value, count) record.
    size_t distinct = 0;
    size_t pos = 0;
    while (pos < input_size) {
        const int current = input[pos];
        int run_length = 0;

        while (pos < input_size && input[pos] == current) {
            run_length++;
            pos++;
        }

        tally[distinct].value = current;
        tally[distinct].count = run_length;
        distinct++;
    }

    // Order the records by count, largest first.
    qsort(tally, distinct, sizeof *tally, value_count_greater);

    // Never write more than the caller asked for, nor more than exist.
    size_t limit = output_size;
    if (distinct < limit) {
        limit = distinct;
    }
    for (size_t k = 0; k < limit; k++) {
        output[k] = tally[k].value;
    }

    free(tally);
    return limit;
}

在C中的数组中找到n个重复数字

问题描述

2 个解决方案

解决方案1
1 已采纳 2016-05-01 17:18:52

Code 码

Example output 输出示例

解决方案2
0 2016-05-01 02:49:01

在C中的数组中找到n个重复数字

问题描述

2 个解决方案

解决方案1 1 已采纳 2016-05-01 17:18:52

Code 码

Example output 输出示例

解决方案2 0 2016-05-01 02:49:01

解决方案1
1 已采纳 2016-05-01 17:18:52

解决方案2
0 2016-05-01 02:49:01