bool read_document(PhraseLargeTable3 * phrase_table, FacadePhraseIndex * phrase_index, FILE * document, HashofDocument hash_of_document, HashofUnigram hash_of_unigram){ char * linebuf = NULL;size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while ( getline(&linebuf, &size, document) ){ if ( feof(document) ) break; if ( '/n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '/0'; } TAGLIB_PARSE_SEGMENTED_LINE(phrase_index, token, linebuf); last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; gpointer value = NULL; gboolean lookup_result = g_hash_table_lookup_extended (hash_of_unigram, GUINT_TO_POINTER(cur_token), NULL, &value); if ( !lookup_result ){ g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token), GUINT_TO_POINTER(1)); } else { guint32 freq = GPOINTER_TO_UINT(value); freq ++; g_hash_table_insert(hash_of_unigram, GUINT_TO_POINTER(cur_token), GUINT_TO_POINTER(freq)); } /* skip pi-gram training. */ if ( null_token == last_token ){ if ( !g_train_pi_gram ) continue; last_token = sentence_start; } /* remember the (last_token, cur_token) word pair. */ HashofSecondWord hash_of_second_word = NULL; lookup_result = g_hash_table_lookup_extended (hash_of_document, GUINT_TO_POINTER(last_token), NULL, &value); if ( !lookup_result ){ hash_of_second_word = g_hash_table_new (g_direct_hash, g_direct_equal); } else { hash_of_second_word = (HashofSecondWord) value; } value = NULL; lookup_result = g_hash_table_lookup_extended (hash_of_second_word, GUINT_TO_POINTER(cur_token), NULL, &value); guint32 count = 0; if ( lookup_result ) { count = GPOINTER_TO_UINT(value); } count ++; g_hash_table_insert(hash_of_second_word, GUINT_TO_POINTER(cur_token), GUINT_TO_POINTER(count)); g_hash_table_insert(hash_of_document, GUINT_TO_POINTER(last_token), hash_of_second_word); } free(linebuf); return true;}