首页 > 解决方案 > 打印给定文本文件中出现频率最高的单词,无法在 C 中按频率排序

问题描述

我正在做一项作业,要求我打印给定文本文件中出现频率最高的 10 个单词。我的代码正在打印文件中的单词,但它没有根据它们的频率对它们进行排序。

下面是我的代码。我使用哈希表来存储每个唯一单词及其频率。我目前正在使用我编写的 wordcmp 函数对单词进行排序,并在 main 的内置 qsort 函数中调用它。

如果有人可以指导我解决我的错误,我将非常感激。

我当前的输出:

前 10 个单词(共 10 个)是:

1 分钟

1 是

1 再次

3 开心

2 你好

1 如何

1 让

1 你

1 次尝试

1 这个

预期输出(我想要的):

前 10 个单词(共 10 个)是:

3 开心

2 你好

1 你

1 次尝试

1 这个

1 让

1 分钟

1 如何

1 是

1 再次

这是我的一些代码:

/* One node of a hash-table bucket chain: a word and its occurrence count. */
typedef struct word
{ 
  char *s;          /* the word */
  int count;        /* number of times word occurs */
  struct word* next; /* next node in the same bucket, or NULL at the end of the chain */
}word;

/* Hash table of word nodes, indexed by bucket. */
struct hashtable
{
  word **table;     /* array of bucket head pointers; ht_create() allocates one per bucket */
  int tablesize;    /* number of buckets in `table` */
  int currentsize;  /* incremented by add() each time it stores a new node */
};
typedef struct hashtable hashtable;
/*
 * Count the words of a text file in a hash table, then print up to
 * `top_words` of them together with their counts.
 *
 * NOTE(review): this is an excerpt.  `i`, `total_unique_words`, getWord()
 * and freetable() are not declared in the visible code, and nothing visible
 * assigns `file_name` before it is passed to fopen() -- confirm against the
 * full program.
 */
int main(int argc, char *argv[])
{

    int top_words = 10;
    /* NOTE(review): this local shadows the typedef name `word` for the rest of main */
    word *word = NULL;
    /* NOTE(review): the result of ht_create() is used below without a NULL check */
    hashtable *hash = ht_create(5000);
    char *file_name;
    char *file_word;
    FILE *fp;
    struct word *present = NULL;

    fp = fopen (file_name, "r");
    if (fp == NULL)
    {
        fprintf (stderr,"%s: No such file or directory\n", file_name);
        fprintf(stderr,"The top %d words (out of 0) are:\n", top_words); 
        exit(-1);
    }

    /* NOTE(review): no goto targeting this label is visible in this excerpt */
    continue_program:
    /* read words until getWord() returns a null pointer, adding each one to the table */
    while ((file_word = getWord(fp)))
    {
        word = add(hash, file_word, 1);
    }
    fclose(fp);

    /*
     * NOTE(review): qsort passes the comparator pointers to the array
     * elements.  The elements of hash->table are `word *`, so wordcmp is
     * handed `word **` values although it is declared to take `word *`.
     * Only the first `currentsize` slots of the bucket array are sorted
     * (empty slots included), and nodes reachable only through ->next are
     * not part of the array at all.  Because `word` names the local pointer
     * variable here, sizeof(word) is the size of a pointer.
     */
    qsort((void*)hash->table, hash->currentsize, sizeof(word),(int (*)(const void *, const void *)) wordcmp);

    /* never report more words than were counted */
    if(top_words > total_unique_words)
          top_words = total_unique_words;

    printf("the top %d words (out of %d) are:\n", top_words, total_unique_words);

    int iterations =0;
    /*
     * NOTE(review): `i <= hash->tablesize` also visits index `tablesize`,
     * one past the last slot that ht_create() allocates.
     */
    for(i =0; i <= hash->tablesize && iterations< top_words; i++)
    {
          present = hash->table[i];
          if(present != NULL)
          {
              printf("     %4d %s\n", present->count, present->s);
              /* NOTE(review): `present` is reassigned at the top of the next
               * iteration, so only the first node of each bucket is printed */
              present = present->next;
              iterations++;
          }
    }
    freetable(hash);

 return 0;
}

/*
 * Compare two word records by occurrence count, highest count first.
 *
 * Returns a negative value when `a` should be placed before `b`, a positive
 * value when `b` should be placed before `a`, and 0 when both have the same
 * count.  A NULL record is ordered after every non-NULL record and two NULL
 * records compare equal; treating NULL as equal to everything would make the
 * ordering inconsistent, which sort routines such as qsort() do not allow.
 */
int wordcmp (word *a, word *b) 
{
    if (a == NULL || b == NULL)
        return (a == NULL) - (b == NULL);

    /* descending order: the larger count sorts first */
    return (a->count < b->count) - (a->count > b->count);
}

/*
 * Create a new hashtable with `size` empty buckets.
 *
 * Returns the new table, or NULL when `size` is less than 1 or when memory
 * cannot be allocated.  On failure nothing is leaked.
 */
struct hashtable *ht_create( int size ) 
{
  int i;

  if( size < 1 ) 
    return NULL;

  hashtable *table = malloc(sizeof *table);
  if( table == NULL )
    return NULL;

  /* calloc checks the count * element-size multiplication for overflow */
  table->table = calloc((size_t) size, sizeof *table->table);
  if( table->table == NULL )
  {
    free(table);
    return NULL;
  }

  table->currentsize = 0;
  table->tablesize = size;

  /* store real null pointers rather than relying on all-bits-zero */
  for( i = 0; i < size; i++ ) 
  {
    table->table[i] = NULL;
  }

  return table; 
}

/*
 * Record one occurrence of `key` in hashtable `h`.
 *
 * If a node for `key` already exists in its bucket, its count is incremented
 * by one and that node is returned.  Otherwise a new node holding a private
 * copy of `key` and the initial count `freq` is linked in at the head of the
 * bucket's chain (nodes already in the bucket are kept), `h->currentsize`
 * and `total_unique_words` are incremented, and the new node is returned.
 *
 * Returns NULL, leaving the table unchanged, when `h` or `key` is NULL or
 * when memory for the new node cannot be allocated.
 */
word * add(hashtable *h, char *key, int freq) 
{
    if (h == NULL || key == NULL)
        return NULL;

    int index = hashcode(key) % h->tablesize;

    /* NOTE(review): hashcode() is not visible here; if it can return a
     * negative int the remainder is negative too, so fold it back in range */
    if (index < 0)
        index += h->tablesize;

    /* Search this bucket's chain for the word itself */
    for (word *current = h->table[index]; current != NULL; current = current->next) {
        if (strcmp(current->s, key) == 0) {
            current->count++;
            return current;
        }
    }

    /* Create new node if no duplicate is found */
    word *newnode = malloc(sizeof *newnode);
    if (newnode == NULL)
        return NULL;

    newnode->s = strdup(key);
    if (newnode->s == NULL) {
        free(newnode);
        return NULL;
    }
    newnode->count = freq;

    /* push onto the front of the chain so collided words are not lost */
    newnode->next = h->table[index];
    h->table[index] = newnode;

    h->currentsize = h->currentsize + 1;
    total_unique_words++;
    return newnode;
}

标签: c struct hash hashtable qsort

解决方案


您面临的主要问题是:您试图对一个使用链式存储桶(链表)的哈希表直接排序。当发生哈希冲突时,您的表不会调整大小,而只是用链表把发生冲突的单词存放在与已有单词相同的 table[index] 位置上。add 函数做的就是这件事。

这很容易导致哈希表的内容如下所示:

table[ 0] = NULL
table[ 1] = foo
table[ 2] = NULL
table[ 3] = |some|->|words|->|that|->|collided|  /* chained bucket */
table[ 4] = other
table[ 5] = words
table[ 6] = NULL
table[ 7] = NULL
...

您不能简单地对这个表调用 qsort 并指望得到按词频排好的结果。qsort 无法知道 "some" 只是链表中的第一个单词;qsort 拿到的只是一个指向 "some" 的指针和 sizeof(word)。

为了简化问题,可以干脆放弃哈希表,改用一个动态分配的 word** 数组。您可以使用一个类似的 add 函数,在遇到重复单词时增加其出现次数,从而避免链式存储桶带来的所有问题。(如果为每个单词使用自动存储,最后只需对指针调用一次 free() 就完成了。)

以下示例接受 2 个参数:第一个是要从中读取单词的文件名,第二个(可选)是一个整数,用于把排序后的输出限制为出现次数最多的若干个单词。words_t 结构为 word 使用自动存储,长度限制为 32 个字符(未删节词典中最长的单词为 28 个字符)。您可以根据需要修改单词的存储方式或读取方式,以便在解析输入时忽略标点符号和复数形式。下面的代码在所有标点符号处分隔单词(连字符除外),并丢弃单词的所有格形式(例如,遇到 "Mike's" 时存储 "Mike",丢弃 "'s")。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>

#define MAXC   32   /* max word length is 28-char, 29-char is sufficient */
#define MAXW  128   /* initial maximum number of words to allocate */

/* one distinct word together with its occurrence count */
typedef struct {
    char word[MAXC];    /* the word, stored inline as a nul-terminated string */
    size_t ninst;       /* and the number of times it occurs */
} words_t;

/*  function prototypes */
/* count one occurrence of 'word' in 'words'; returns the (possibly
 * reallocated) array, which the caller must assign back */
void *addword (words_t *words, const char *word, size_t *wc, size_t *maxw);
/* grow 'ptr' to twice '*nelem' elements of 'psz' bytes and update '*nelem';
 * exits the program if the reallocation fails */
void *xrealloc (void *ptr, size_t psz, size_t *nelem);

/* qsort compare function for words_t (alphabetical) */
int cmpwrds (const void *a, const void *b)
{
    return strcmp (((words_t *)a)->word, ((words_t *)b)->word);
}

/* qsort compare function for words_t (by occurrence - descending)
 * and alphabetical (ascending) if occurrences are equal)
 */
int cmpinst (const void *a, const void *b)
{
    int ndiff =  (((words_t *)a)->ninst < ((words_t *)b)->ninst) - 
                (((words_t *)a)->ninst > ((words_t *)b)->ninst);

    if (ndiff)
        return ndiff;

    return strcmp (((words_t *)a)->word, ((words_t *)b)->word);
}

/** count the words read from a file and print them, most frequent first.
 *
 *  usage: program [filename [top]]
 *
 *  argv[1] names the file to read; without it the words are read from
 *  stdin. argv[2], if it converts to a number, limits the output to that
 *  many words (0, no value, or a value larger than the number of distinct
 *  words prints all of them).
 *
 *  words are separated at whitespace and at all punctuation except '-',
 *  and a lone "s" (as left behind by "'s") is discarded. a fragment that
 *  ends in '-' is not added at the separator; it stays in the buffer and
 *  is joined with the characters that follow.
 *
 *  returns 0 on success, 1 if the file cannot be opened, the initial
 *  allocation fails, or a word does not fit in MAXC - 1 characters.
 */
int main (int argc, char **argv) {

    int c = 0, nc = 0, prev = ' ', total = 0;
    size_t maxw = MAXW, wc = 0, top = 0, limit = 0;
    char buf[MAXC] = "";
    words_t *words = NULL;
    /* read from the named file, or from stdin when no filename is given */
    FILE *fp = argc > 1 ? fopen (argv[1], "r") : stdin;
    /* remember the source name now; fp must not be inspected after fclose */
    const char *src = argc > 1 ? argv[1] : "stdin";

    if (!fp) {  /* validate file open for reading (only reachable if argc > 1) */
        fprintf (stderr, "error: file open failed '%s'.\n", argv[1]);
        return 1;
    }

    if (argc > 2) { /* if 2 args, convert argv[2] to number of top words */
        char *p = argv[2];
        errno = 0;  /* strtoul only sets errno on failure, never clears it */
        size_t tmp = strtoul (argv[2], &p, 0);
        if (p != argv[2] && !errno)
            top = tmp;
    }

    /* allocate/validate initial words */
    if (!(words = calloc (maxw, sizeof *words))) {
        perror ("calloc-words");
        if (fp != stdin) fclose (fp);
        return 1;
    }

    while ((c = fgetc(fp)) != EOF) {        /* read each character in file */
        if (c != '-' && (isspace (c) || ispunct (c))) { /* word-end found */
            if (!isspace (prev) && !ispunct (prev)) {   /* multiple ws/punct */
                if (!(prev == 's' && nc == 1)) {        /* exclude "'s" */
                    buf[nc] = 0;                        /* nul-terminate */
                    words = addword (words, buf, &wc, &maxw);   /* add word */
                }
                /* reset char count even when the lone "s" was skipped, so
                 * it is not glued onto the front of the next word */
                nc = 0;
            }
        }
        else if (nc < MAXC - 1) {   /* add char to buf */
            buf[nc++] = c;
        }
        else {  /* chars exceed MAXC - 1; storage capability of struct */
            fprintf (stderr, "error: characters exceed %d.\n", MAXC);
            free (words);
            if (fp != stdin) fclose (fp);
            return 1;
        }
        prev = c;   /* save previous char */
    }
    /* handle non-POSIX end (no separator after the final word): terminate
     * the buffer first, otherwise the tail of an earlier, longer word would
     * be counted as part of this one */
    if (!isspace (prev) && !ispunct (prev) && !(prev == 's' && nc == 1)) {
        buf[nc] = 0;
        words = addword (words, buf, &wc, &maxw);
    }

    if (fp != stdin) fclose (fp);   /* close file if not stdin */

    qsort (words, wc, sizeof *words, cmpinst);  /* sort words by frequency */

    printf ("'%s' contained '%zu' words.\n\n",  /* output total No. words */
            src, wc);

    /* output top words (or all words in descending order if top not given);
     * never index past the wc entries that were actually stored */
    limit = (top != 0 && top < wc) ? top : wc;
    for (size_t i = 0; i < limit; i++) {
        printf ("  %-28s    %5zu\n", words[i].word, words[i].ninst);
        total += words[i].ninst;
    }
    printf ("%33s------\n%34s%5d\n", " ", "Total: ", total);

    free (words);

    return 0;
}

/** add word to words, updating pointer to word-count 'wc' and
 *  the maximum words allocated 'maxw' as needed. returns pointer
 *  to words (which must be assigned back in the caller).
 *
 *  if 'word' is already stored, its occurrence count is incremented;
 *  otherwise it is appended with a count of 1, growing the array through
 *  xrealloc() when it is full. each entry holds at most MAXC - 1
 *  characters: a longer 'word' is stored (and matched) by its first
 *  MAXC - 1 characters instead of overflowing the entry.
 */
void *addword (words_t *words, const char *word, size_t *wc, size_t *maxw)
{
    /* stored words are at most MAXC - 1 characters long, so comparing that
     * many characters matches an over-long word with its truncated copy */
    for (size_t i = 0; i < *wc; i++) {
        if (strncmp (words[i].word, word, MAXC - 1) == 0) {
            words[i].ninst++;
            return words;
        }
    }

    if (*wc == *maxw)
        words = xrealloc (words, sizeof *words, maxw);

    /* bounded copy, always nul-terminated */
    snprintf (words[*wc].word, sizeof words[*wc].word, "%s", word);
    /* set the count explicitly rather than relying on zeroed storage */
    words[*wc].ninst = 1;
    (*wc)++;

    return words;
}

/** grow 'ptr', an array of '*nelem' elements of 'psz' bytes each, to
 *  twice as many elements (or to 1 element when '*nelem' is 0).
 *  returns pointer to the reallocated block with the newly added
 *  elements set to zero bytes, and stores the new element count in
 *  '*nelem'. return must be assigned to the original pointer in the
 *  caller. exits the program with EXIT_FAILURE if 'psz' is 0, if the
 *  new size in bytes cannot be represented in a size_t, or if the
 *  reallocation fails.
 */
void *xrealloc (void *ptr, size_t psz, size_t *nelem)
{
    const size_t maxsz = (size_t)-1;    /* largest value a size_t can hold */
    size_t oldn = *nelem;
    size_t newn = oldn ? oldn * 2 : 1;  /* doubling 0 would never grow */
    void *memptr = NULL;

    /* refuse sizes whose byte count would wrap around */
    if (psz == 0 || oldn > maxsz / 2 || newn > maxsz / psz) {
        fputs ("xrealloc(): requested size is invalid or too large.\n", stderr);
        exit (EXIT_FAILURE);
    }

    memptr = realloc (ptr, newn * psz);
    if (!memptr) {
        perror ("realloc(): virtual memory exhausted.");
        exit (EXIT_FAILURE);
    }

    /* zero only the elements added beyond the old count */
    memset ((char *)memptr + oldn * psz, 0, (newn - oldn) * psz);
    *nelem = newn;

    return memptr;
}

(注意:输出按出现次数降序排序;出现次数相同的单词按字母顺序排序。)

示例使用/输出

$ ./bin/getchar_wordcnt_top dat/damages.txt 10
'dat/damages.txt' contained '109' words.

  the                                12
  a                                  10
  in                                  7
  of                                  7
  and                                 5
  anguish                             4
  injury                              4
  jury                                4
  mental                              4
  that                                4
                                 ------
                           Total:    61

注意:如果要把哈希表用作存储的基础,您至少必须创建一个指针数组,其中每个指针指向哈希表中的一个单词,然后对这个指针数组进行排序。否则,您就需要复制存储,把所有单词复制到一个新数组中再排序(这在内存上是一种比较低效的做法)。为哈希表中的每个单词创建一个单独的指针数组来排序,是既能调用 qsort 又能避免链式存储桶问题的唯一方法。


推荐阅读