简体   繁体   中英

Find n repeating numbers in an array in C

I have implemented the problem statement below using the hashtable library functions in C. Since I have never used standard library hashtable in C, my questions are:

  1. Am I using the hashtable functions correctly (I believe getting output doesn't mean right usage)?
  2. Is there any better way to achieve the solution for the given problem statement?

Problem statement: Find the n most frequent element in an array.

  • 1 < N < 100000 [ Length of Array]
  • -1000000 < n < 1000000 [ Array integers]

I have gone through some of the similar questions at SO - and in one of the answers I do see the recommended approach being to use a hashtable.

#include <search.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define REPEAT 3
#define BUFFERSIZE 10

void freqElement(int* arr, int len, int times);
int createHT(int* arr, int len);

// Demo driver: counts every value of a fixed sample array in the process-wide
// hsearch() table, then asks freqElement() for a value occurring REPEAT times.
//
// Returns EXIT_SUCCESS on success, EXIT_FAILURE when the table cannot be
// created or populated.
int main(void)
{
    int arr[] = {2, 3, 5, 6, 10, 10, 2, 5, 2};
    int len = sizeof(arr) / sizeof(arr[0]);

    // hcreate() returns 0 on failure; hsearch() must not be used in that case.
    if (hcreate(len) == 0)
    {
        fprintf(stderr, "failed to create hash table of size %d\n", len);
        return EXIT_FAILURE;
    }

    int status = EXIT_SUCCESS;
    if (!createHT(arr, len))
    {
        printf(" error in entering data \n");
        status = EXIT_FAILURE;
    }
    else
    {
        freqElement(arr, len, REPEAT);
    }

    hdestroy();
    return status;
}

// Count the occurrences of every element of arr in the process-wide hsearch()
// table, which the caller must already have created with hcreate().
//
// Each distinct value is keyed by its decimal text. The entry's data pointer
// carries the occurrence count as an integer; read it back with
// (intptr_t)entry->data.
//
//   arr  values to count (not modified)
//   len  number of elements in arr
//
// Returns 1 on success. If a key cannot be formatted, allocated or inserted,
// "entry failed" is written to stderr and the process exits with EXIT_FAILURE.
int createHT(int* arr, int len)
{
    for (int i = 0; i < len; i++)
    {
        // Sign + at most 3 decimal digits per byte + NUL holds any int, so two
        // different values can never collapse into the same truncated key.
        char text[3 * sizeof(int) + 2];
        int text_len = snprintf(text, sizeof(text), "%d", arr[i]);
        if (text_len < 0 || (size_t)text_len >= sizeof(text))
        {
            fprintf(stderr, "entry failed\n");
            exit(EXIT_FAILURE);
        }

        // Looking up with the stack buffer is safe: FIND never stores the key.
        ENTRY probe = { .key = text, .data = NULL };
        ENTRY *found = hsearch(probe, FIND);
        if (found != NULL)
        {
            found->data = (void *)((intptr_t)found->data + 1);
            continue;
        }

        // ENTER keeps the key pointer it is handed, so a new entry needs
        // storage that outlives this loop iteration.
        char *owned_key = malloc((size_t)text_len + 1);
        if (owned_key == NULL)
        {
            fprintf(stderr, "entry failed\n");
            exit(EXIT_FAILURE);
        }
        memcpy(owned_key, text, (size_t)text_len + 1);

        ENTRY fresh = { .key = owned_key, .data = (void *)(intptr_t)1 };
        if (hsearch(fresh, ENTER) == NULL)
        {
            fprintf(stderr, "entry failed\n");
            free(owned_key);
            exit(EXIT_FAILURE);
        }
    }
    return 1;
}

// Print the first element of arr whose count in the process-wide hsearch()
// table equals times, then stop. Prints nothing when no element matches.
//
// The table must already hold one entry per distinct value, keyed by the
// value's decimal text, with the count stored as an integer in the data
// pointer.
//
//   arr    values to look up, scanned in order
//   len    number of elements in arr
//   times  occurrence count to look for
void freqElement(int* arr, int len, int times)
{
    for (int i = 0; i < len; i++)
    {
        // Sign + at most 3 decimal digits per byte + NUL holds any int.
        char text[3 * sizeof(int) + 2];
        snprintf(text, sizeof(text), "%d", arr[i]);

        // Fully initialised so no indeterminate member is passed by value.
        ENTRY probe = { .key = text, .data = NULL };
        const ENTRY *found = hsearch(probe, FIND);

        // The count round-trips through intptr_t; a plain int cast would
        // truncate the pointer on LP64 targets.
        if (found != NULL && (intptr_t)found->data == times)
        {
            printf(" value %s is repeated %d times \n", found->key, times);
            break;
        }
    }
}

I'm not sure I'd use the hcreate(), hsearch(), hdestroy() triad of functions for this task, but they can be used. The POSIX specification is not clear on some issues, such as the release of the keys by hdestroy(), but the Mac OS X manual says:

The hdestroy() function disposes of the search table, and may be followed by another call to hcreate() . After the call to hdestroy() , the data can no longer be considered accessible. The hdestroy() function calls free(3) for each comparison key in the search table but not the data item associated with the key.

(POSIX doesn't mention hdestroy() calling free() on the comparison keys.)

Here's a relatively simple adaptation of your code that works and runs cleanly under valgrind , at least with GCC 6.1.0 and Valgrind 3.12.0-SVN on Mac OS X 10.11.4.

$ gcc -O3 -g -std=c11 -Wall -Wextra -Wmissing-prototypes \
>     -Wstrict-prototypes -Wold-style-definition -Werror hs17.c -o hs17
$

Code

#include <search.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUFFERSIZE 10

void freqElement(int *arr, int len, int times);
int createHT(int *arr, int len);

// Demo driver: counts every value of a fixed sample array in the process-wide
// hsearch() table, then reports the values for each possible occurrence count.
//
// Returns EXIT_SUCCESS on success, EXIT_FAILURE when the table cannot be
// created or populated.
int main(void)
{
    int arr[] = { 2, 3, 5, 6, 10, 10, 2, 5, 2, 8, 8, 7, 8, 7, 8, 7, };
    int len = sizeof(arr) / sizeof(arr[0]);

    if (hcreate(len) == 0)
    {
        fprintf(stderr, "Failed to create hash table of size %d\n", len);
        return EXIT_FAILURE;
    }

    int status = EXIT_SUCCESS;
    if (!createHT(arr, len))
    {
        fprintf(stderr, "error in entering data\n");
        status = EXIT_FAILURE;
    }
    else
    {
        // A value can occur anywhere from 1 to len times inclusive, so the
        // upper bound must be reached (an all-equal array has count == len).
        for (int times = 1; times <= len; times++)
            freqElement(arr, len, times);
    }

    hdestroy();
    return status;
}

// Count the occurrences of every element of arr in the process-wide hsearch()
// table, which the caller must already have created with hcreate().
//
// Each distinct value is keyed by a heap-allocated copy of its decimal text.
// The entry's data pointer carries the occurrence count as an integer; read
// it back with (intptr_t)entry->data. A "Processing [...]" progress line is
// printed to stdout for every element.
//
//   arr  values to count (not modified)
//   len  number of elements in arr
//
// Returns 1 on success. If a key cannot be formatted, allocated or inserted,
// a message is written to stderr and the process exits with EXIT_FAILURE.
int createHT(int *arr, int len)
{
    for (int i = 0; i < len; i++)
    {
        // Sign + at most 3 decimal digits per byte + NUL holds any int, so two
        // different values can never collapse into the same truncated key.
        char text[3 * sizeof(int) + 2];
        int text_len = snprintf(text, sizeof(text), "%d", arr[i]);
        if (text_len < 0 || (size_t)text_len >= sizeof(text))
        {
            fprintf(stderr, "entry failed for [%d]\n", arr[i]);
            exit(EXIT_FAILURE);
        }
        printf("Processing [%s]\n", text);

        // Probe with the stack buffer first: FIND never stores the key, so
        // repeated values cost no allocation at all.
        ENTRY probe = { .key = text, .data = NULL };
        ENTRY *found = hsearch(probe, FIND);
        if (found != NULL)
        {
            found->data = (void *)((intptr_t)found->data + 1);
            continue;
        }

        // ENTER keeps the key pointer it is handed, so a first occurrence
        // needs its own heap copy.
        char *owned_key = malloc((size_t)text_len + 1);
        if (owned_key == NULL)
        {
            fprintf(stderr, "entry failed for [%s]\n", text);
            exit(EXIT_FAILURE);
        }
        memcpy(owned_key, text, (size_t)text_len + 1);

        ENTRY fresh = { .key = owned_key, .data = (void *)(intptr_t)1 };
        if (hsearch(fresh, ENTER) == NULL)
        {
            fprintf(stderr, "entry failed for [%s]\n", text);
            free(owned_key);    // Not dreadfully important
            exit(EXIT_FAILURE);
        }
    }
    return 1;
}

// Report whether value occurs among the first len elements of arr.
// Callers pass the current index as len to learn whether the element at that
// index duplicates an earlier one.
static bool processed_before(int *arr, int len, int value)
{
    bool seen = false;
    int idx = 0;

    // Stop at the first match or once the prefix is exhausted.
    while (!seen && idx < len)
    {
        seen = (arr[idx] == value);
        idx++;
    }
    return seen;
}

// Print every distinct element of arr whose count in the process-wide
// hsearch() table equals times. Each such value is printed once, at its first
// position in arr.
//
// The table must already hold one entry per distinct value, keyed by the
// value's decimal text, with the count stored as an integer in the data
// pointer.
//
//   arr    values to look up, scanned in order
//   len    number of elements in arr
//   times  occurrence count to look for
void freqElement(int *arr, int len, int times)
{
    for (int i = 0; i < len; i++)
    {
        // Sign + at most 3 decimal digits per byte + NUL holds any int, so
        // the lookup key is never a truncated prefix of the real value.
        char text[3 * sizeof(int) + 2];
        snprintf(text, sizeof(text), "%d", arr[i]);

        // Fully initialised so no indeterminate member is passed by value.
        ENTRY probe = { .key = text, .data = NULL };
        const ENTRY *found = hsearch(probe, FIND);
        if (found == NULL)
            continue;

        // Only the first occurrence in arr reports, which suppresses repeats.
        if ((intptr_t)found->data == times && !processed_before(arr, i, arr[i]))
            printf(" value %s is repeated %d times\n", found->key, times);
    }
}

The processed_before() function prevents values with multiple entries being printed multiple times — it's a consequence of the change to the freqElement() function that reports on all the entries with the given number of appearances, rather than just the first such entry. It isn't entirely desirable, but it keeps each value from being reported more than once. The code includes some printing so that the progress can be monitored, which helps reassure that the code is working correctly.

Example output

Processing [2]
Processing [3]
Processing [5]
Processing [6]
Processing [10]
Processing [10]
Processing [2]
Processing [5]
Processing [2]
Processing [8]
Processing [8]
Processing [7]
Processing [8]
Processing [7]
Processing [8]
Processing [7]
 value 3 is repeated 1 times 
 value 6 is repeated 1 times 
 value 5 is repeated 2 times 
 value 10 is repeated 2 times 
 value 2 is repeated 3 times 
 value 7 is repeated 3 times 
 value 8 is repeated 4 times 

Let's start from the problem statement, since as I mentioned in a comment I don't think your current code addresses the problem:

Find the n most frequent element in an array.

1 < N < 100000 [Length of Array]

-1000000 < n < 1000000 [Array integers]

I take this to mean we want a function like this:

size_t n_most_popular(int input[], size_t input_size, int output[], size_t output_size);

This function takes the input array (of size up to 100000) of integers (between -1000000 and 1000000), and populates the output array with the N most common elements from the input, where N is output_size . For convenience we can stipulate that we will place the most common element at the front of the output, and less common elements toward the back.

A straightforward approach to this would be to first sort the input array (possibly in place, possibly using standard qsort() ). Then you'll have an array like this:

[1,1,1,1,2,2,3,3,3,3,3,4,7,...]

Then, make an array of structs where each struct contains a unique value from the input, plus the number of times it occurs. Maximum length of this is input_size , and it's trivial to build it in a single pass from the sorted input.

Finally, sort this array of structs by the count field in descending order, using standard qsort() . Copy the first output_size elements to the output array, and return the actual populated size of the output array (which may be smaller than output_size if there are not enough unique values in the input array).

Here's a working C program to do just that:

#include <string.h>
#include <stdio.h>
#include <stdlib.h>

size_t most_popular(int input[], size_t input_size, int output[], size_t output_size);

// Demo driver: asks most_popular() for the three most frequent values of a
// fixed sample array and prints them in rank order, most frequent first.
int main(void)
{
    int arr[] = {2, 3, 5, 6, 10, 10, 2, 5, 2};
    size_t len = sizeof(arr) / sizeof(arr[0]);

    int out[3];
    size_t outlen = sizeof(out) / sizeof(out[0]);

    size_t count = most_popular(arr, len, out, outlen);

    for (size_t ii = 0; ii < count; ii++) {
        // %zu is the conversion for size_t; %lu is undefined behaviour on
        // platforms where size_t is not unsigned long.
        printf("most popular rank %zu: %d\n", ii + 1, out[ii]);
    }

    return 0;
}

// Pairs one distinct input value with the number of times it was seen.
typedef struct value_count
{
    int value;  // the distinct value
    int count;  // how many times that value occurs
} value_count;

int value_count_greater(const void* lhs, const void* rhs)
{
    const value_count *vcl = lhs, *vcr = rhs;
    return vcr->count - vcl->count;
}

// qsort() comparator that orders ints ascending.
//
// Returns -1, 0 or 1 as *lhs is less than, equal to or greater than *rhs.
// Two comparisons replace the usual subtraction, which overflows (undefined
// behaviour) for operands that are far apart, such as INT_MIN and 1.
int int_less(const void *lhs, const void *rhs)
{
    const int left = *(const int *)lhs;
    const int right = *(const int *)rhs;

    return (left > right) - (left < right);
}

// Find the most frequently occurring values in input.
//
//   input        values to analyse; sorted in place with int_less as a side
//                effect
//   input_size   number of elements in input
//   output       receives up to output_size distinct values, ordered by
//                value_count_greater (most frequent first)
//   output_size  capacity of output
//
// Returns the number of elements written to output: the smaller of
// output_size and the number of distinct values in input. Returns 0 if
// input_size is 0 or memory cannot be allocated.
size_t most_popular(int input[], size_t input_size, int output[], size_t output_size)
{
    // Nothing to rank; this also avoids depending on what malloc(0) returns.
    if (input_size == 0) {
        return 0;
    }

    // Sorting groups equal values together so one pass can count each run.
    qsort(input, input_size, sizeof(input[0]), int_less);

    // calloc() rejects an overflowing element-count * size product instead of
    // silently under-allocating.
    value_count *value_counts = calloc(input_size, sizeof(*value_counts));
    if (value_counts == NULL) {
        return 0;
    }

    // count how many times each value occurs in input
    size_t unique_count = 0;
    for (size_t ii = 0; ii < input_size; ii++) {
        if (unique_count > 0 && input[ii] == value_counts[unique_count - 1].value) {
            value_counts[unique_count - 1].count++;
        } else {
            value_counts[unique_count].value = input[ii];
            value_counts[unique_count].count = 1;
            unique_count++;
        }
    }

    // sort unique values by how often they occur, most popular first
    qsort(value_counts, unique_count, sizeof(value_counts[0]), value_count_greater);

    size_t result_size = unique_count < output_size ? unique_count : output_size;
    for (size_t ii = 0; ii < result_size; ii++) {
        output[ii] = value_counts[ii].value;
    }

    free(value_counts);
    return result_size;
}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM