HashTable top 20 count ideas

У меня немного проблемы с мыслью о решении моей проблемы. У меня есть программа подсчета слов, которая использует хэш-таблицу для подсчета всех слов в любом количестве файлов и печатает только слова , которые есть во всех файлах и их счет. Я также храню все мои использованные хэш-индексы в связанном списке.HashTable top 20 count ideas

Решил мою собственную проблему, я знал, что ответ будет прост. Я просто вычислил тот, у которого самый низкий счет, и если мое новое значение было больше, чем это, но оно указало индекс одного из самых низких значений в массиве из 20 структур слов.

Спасибо за вашу помощь всем!

 #include <unistd.h> 
    #include <stdio.h> 
    #include <stdlib.h> 
    #include <sys/types.h> 
    #include <sys/stat.h> 
    #include <fcntl.h> 
    #include <pthread.h> 
    #include <string.h> 


    /*Structures*/////////////////////////////////////////// 
    //A struct to hold the words in the hash tables and their 
    //counts 
    struct counter{ 
     int count; 
     int printed; 
     char word[51]; 
     int allfiles[101]; 
     struct counter * next; 
    }; 

    //A struct to hold the hash indexes of the already visited 
    //index, for easy printing 
    struct index{ 
     int used; 
     struct index * next; 
    }; 

    //A simple struct to pass arguments to the work function for 
    //threading 
    struct arguments{ 
     void * id; 
     int fileid; 
    }; 
    //////////////////////////////////////////////////////// 

    /*Functions*//////////////////////////////////////////// 
    static int hash(char * word); 
    static void inHash(struct counter * newWord, int hash, int FILEID); 
    static void indexchain(int hash); 
    //static void hashprint(int NUMFILES); 
    static void * work(struct arguments *); 
    static void toptwenty(int NUMFILES); 
    static void print(); 
    //////////////////////////////////////////////////////// 

    /*Global Variables*///////////////////////////////////// 
    struct counter * top[20] = {0}; 
    struct counter * hashtable[6222] = {0}; 
    struct index * head; 
    //////////////////////////////////////////////////////// 

    int main(int argc, char * argv[]) 
    { 

     //check for valid number of arguments 
     if(argc < 2) 
     { 
     fprintf(stderr, "invalid number of arguments"); 
     return 1; 
     } 


     //set up index chain starts with a null node 
     head = malloc(sizeof(struct index)); 
     head->next = NULL; 
     head->used = -1; 

     //loop through any number of files 
     int arg; 
     for(arg = 1; arg < argc; arg++) 
     { 
     struct arguments * argum = malloc(sizeof(struct arguments)); 
     argum->fileid = arg; 
     argum->id = ((void*)argv[arg]); 
     work(argum); 
     } 

     //hashprint(argc); 
     toptwenty(argc); 
     print(); 
     return 0; 
    } 


    /*Function Definitions*/ 
    //this function takes a file name and counts 
    //the words in the file 
    static void * work(struct arguments * argum) 
    { 
     int FILEID = argum->fileid; 
     void * in = argum->id; 
     int fd = open((char*)in, O_RDONLY); 
     if (fd == -1) 
     { 
     fprintf(stderr, "can't open %s for reading!\n", (char*)in); 
     exit(-1); 
     } 
     int BUFLEN = (int) lseek(fd, 0, SEEK_END); 
     lseek(fd, 0, 0); 
     //A few variable 
     char buf[BUFLEN + 1]; 
     int lastRead; 

     lastRead = read(fd, buf, BUFLEN); 
     if (lastRead == -1) 
     { 
      fprintf(stderr, "error reading file %s!\n", (char*)in); 
      exit(-1); 
     } 

     //Parse the filebuffer for words. 
     char newword[51]; 
     int c; 
     int curindex = 0; 

     buf[BUFLEN + 1] = ' '; 
     //not doing the last space because it is eof 
     for(c = 0; c < BUFLEN + 1; c++) 
     { 
     if((buf[c] >= 'A' && buf[c] <= 'Z')) 
     { 
      buf[c] += 32; 
     } 
     if(buf[c] >= 'a' && buf[c] <= 'z') 
     { 
      //add the next char to the string. 
      newword[curindex] = buf[c]; 
      curindex++; 
     } 
     else 
     { 
      //make a new struct for the entry, and add it to the hashtable 
      //add its hash to the 
      if(strlen(newword) >= 6) 
      { 
      struct counter * temp = malloc(sizeof(struct counter)); 
      strcpy(temp->word,newword); 
      int thishash = hash(temp->word); 

      //Only save hash indexes if they are in the first files 
      if(FILEID == 1) 
      {  
       indexchain(thishash); 
      } 
      inHash(temp, thishash, FILEID);   
      } 

      int wordlength = strlen(newword); 
      int i; 
      for(i = 0;i < wordlength; i++) 
      { 
      newword[i] = 0; 
      } 
      curindex = 0; 
     } 


     } 
     close(fd); 
     return in; 

    } 

    //Bad hash function by just adding ascii values of the 
    //characters 
    static int hash(char * word) 
    { 
     int loop = strlen(word); 
     int i; 
     int hashval = 0; 
     for(i = 0; i < loop; i++) 
     hashval += word[i]; 

     return hashval; 
    } 

    //add a new word to the hash table 
    static void inHash(struct counter * newWord, int hash, int FILEID) 
    { 
     int eflag = 0; 
     if(hashtable[hash] == NULL) 
     { 
     //if the entry isnt in the table 
     if(FILEID == 1) 
     { 
      newWord->allfiles[FILEID] = 1;      /*FILEID ARRAY TEST*/ 
      newWord->count = 1; 
      newWord->next = NULL; 
      hashtable[hash] = newWord; 
     } 
     } 
     else 
     { 
     //if its not, but what if it is? 
     struct counter * cur = hashtable[hash]; 
     if(strcmp(cur->word, newWord->word) == 0) 
     { 
      //is the word in the first slot? 
      cur->count += 1; 
      cur->allfiles[FILEID] = 1;      /*FILEID ARRAY TEST*/ 
      eflag = 1; 
     } 
     else 
     { 
      while(cur->next != NULL) 
      { 
      cur = cur->next; 
      if(strcmp(cur->word, newWord->word) == 0) 
      { 
       //if the word already exsists, update the count 
       cur->allfiles[FILEID] = 1;       /*FILEID ARRAY TEST*/ 
       cur->count += 1; 
       eflag = 1; 
       break; 
      } 

      } 
     } 

     //if its not in any bucket, make a new bucket 
     if(eflag == 0) 
     { 
      //Else add the new entry to the end of that list 
      if(FILEID == 1) 
      { 
      newWord->allfiles[FILEID] = 1;       /*FILEID ARRAY TEST*/ 
      newWord->count = 1; 
      newWord->next = NULL; 
      cur->next = newWord; 
      } 
     } 
     } 
    } 

    //adding a value to the linked list for printing 
    static void indexchain(int hash) 
    { 
     struct index * p = head; 
     int eflag = 0; 
     while(p->next != NULL) 
     { 
     if(p->used != hash) 
      p = p->next; 
     else 
     { 
      eflag = 1; 
      break; 
     } 
     } 
     if(eflag == 0) 
     { 
     struct index * newValue = malloc(sizeof(struct index)); 
     newValue->used = hash; 
     newValue->next = NULL; 
     p->next = newValue; 
     } 

    } 
    /* 
    //This function will print the values in the hash tables and their counts 
    //Prints based on number of files to check if words are in all files 
    static void hashprint(int NUMFILES) 
    { 
     struct index * p; 
     p = head->next; 
     int hash; 
     int i; 
     int printbool = 1; 
     while(p != NULL) 
     { 
     hash = p->used; 
     struct counter * ptr = hashtable[hash]; 

     while(ptr != NULL) 
     { 

      if(ptr->printed == 0) 
      { 
      for(i = 1; i < NUMFILES; i++) 
      { 
       if(ptr->allfiles[i] == 0) 
       { 
       printbool = 0; 
       break; 
       } 
       else 
       printbool = 1; 
      }  
      if(printbool == 1) 
      { 
       ptr->printed = 1; 
       printf("%s %d\n", ptr->word, ptr->count); 
      }   
      } 
      ptr = ptr->next;   
      } 
      p = p->next; 
     } 


    } 
    */ 
    //A function to see which numbers have the top twenty highest count 
    static void toptwenty(int NUMFILES) 
    { 
     struct index * p; 
     p = head->next; 
     int hash; 
     int i; 
     int printbool = 1; 
     while(p != NULL) 
     { 
     hash = p->used; 
     struct counter * ptr = hashtable[hash]; 

     while(ptr != NULL) 
     { 

      if(ptr->printed == 0) 
      { 
      for(i = 1; i < NUMFILES; i++) 
      { 
       if(ptr->allfiles[i] == 0) 
       { 
       printbool = 0; 
       break; 
       } 
       else 
       printbool = 1; 
      }  
      if(printbool == 1) 
      { 
       for(i = 0; i < 20; i++) 
       { 
       if(top[i] == NULL) 
       { 
        top[i] = ptr; 
        break; 
       } 
       else if(ptr->count > top[i]->count) 
       { 
        top[i] = ptr; 
        break;    
       } 

       } 
      }   
      } 
      ptr = ptr->next;   
      } 
      p = p->next; 
     } 
    } 
    //print the top 20 count 
    static void print() 
    { 
    int i; 
    for(i = 0; i < 20; i++) 
    { 
     if(top[i] != NULL) 
     { 
     if(top[i]->printed == 0) 
     { 
      //printf("%s\n", top[i]->word); 
      printf("%s %d\n", top[i]->word, top[i]->count); 
      top[i]->printed = 1; 
     } 
     } 
     else 
     break; 
    } 
    }

источник

2012-03-25 user1088450

Можете ли вы, возможно, дать более подробную информацию о том, какой текст вам нужен для разбора. И сколько это будет. – Azrael3000

Вы бы разместили код, который у вас есть? Как записывать файлы, в которых находятся слова? – gbulmer

Создайте очередь приоритетов, которая содержит 20 индексов хеширования, которые имеют верхние счета и их соответствующие значения.

При подсчете наименьшее значение находится в верхней части очереди, если ваше новое слово бьет его удалить его из очереди O(1) и добавить новый в очереди O(log(n)) которая только O(log(20)).

источник

2012-03-25 23:03:11 twain249

HashTable top 20 count ideas

ответ

Смежные вопросы