/* ScummVM - Graphic Adventure Engine * * ScummVM is the legal property of its developers, whose names * are too numerous to list here. Please refer to the COPYRIGHT * file distributed with this source distribution. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * $URL$ * $Id$ * */ /* Main vocabulary support functions and word lookup */ #include "sci/include/sciresource.h" #include "sci/include/engine.h" #include "sci/include/kernel.h" int vocab_version; #define VOCAB_RESOURCE_PARSE_TREE_BRANCHES vocab_version==1 ? \ VOCAB_RESOURCE_SCI1_PARSE_TREE_BRANCHES : \ VOCAB_RESOURCE_SCI0_PARSE_TREE_BRANCHES #define VOCAB_RESOURCE_SUFFIX_VOCAB vocab_version==1 ? \ VOCAB_RESOURCE_SCI1_SUFFIX_VOCAB : \ VOCAB_RESOURCE_SCI0_SUFFIX_VOCAB const char *class_names[] = {"", /* These strange names were taken from an SCI01 interpreter */ "", "conj", /* conjunction */ "ass", /* ? */ "pos", /* preposition ? */ "art", /* article */ "adj", /* adjective */ "pron", /* pronoun */ "noun", /* noun */ "auxv", /* auxillary verb */ "adv", /* adverb */ "verb", /* verb */ "", "", "", "" }; int _vocab_cmp_words(const void *word1, const void *word2) { return scumm_stricmp((*((word_t **) word1))->word, (*((word_t **) word2))->word); } word_t ** vocab_get_words(ResourceManager *resmgr, int *word_counter) { int counter = 0; unsigned int seeker; word_t **words; char currentword[256] = ""; /* They're not going to use words longer than 255 ;-) */ int currentwordpos = 0; resource_t *resource; /* First try to load the SCI0 vocab resource. */ resource = scir_find_resource(resmgr, sci_vocab, VOCAB_RESOURCE_SCI0_MAIN_VOCAB, 0); vocab_version = 0; if (!resource) { warning("SCI0: Could not find a main vocabulary, trying SCI01.\n"); resource = scir_find_resource(resmgr, sci_vocab, VOCAB_RESOURCE_SCI1_MAIN_VOCAB, 0); vocab_version = 1; } if (!resource) { warning("SCI1: Could not find a main vocabulary"); return NULL; /* NOT critical: SCI1 games and some demos don't have one! */ } if (vocab_version == 1) seeker = 255 * 2; /* vocab.900 starts with 255 16-bit pointers which we don't use */ else seeker = 26 * 2; /* vocab.000 starts with 26 16-bit pointers which we don't use */ if (resource->size < seeker) { error("Invalid main vocabulary encountered: Too small\n"); return NULL; /* Now this ought to be critical, but it'll just cause parse() and said() not to work */ } words = (word_t**)sci_malloc(sizeof(word_t *)); while (seeker < resource->size) { byte c; words = (word_t**)sci_realloc(words, (counter + 1) * sizeof(word_t *)); currentwordpos = resource->data[seeker++]; /* Parts of previous words may be re-used */ if (vocab_version == 1) { c = 1; while (seeker < resource->size && currentwordpos < 255 && c) { c = resource->data[seeker++]; currentword[currentwordpos++] = c; } if (seeker == resource->size) { warning("SCI1: Vocabulary not usable, disabling.\n"); vocab_free_words(words, counter); return NULL; } } else { do { c = resource->data[seeker++]; currentword[currentwordpos++] = c & 0x7f; /* 0x80 is used to terminate the string */ } while (c < 0x80); } currentword[currentwordpos] = 0; words[counter] = (word_t*)sci_malloc(sizeof(word_t) + currentwordpos); /* Allocate more memory, so that the word fits into the structure */ strcpy(&(words[counter]->word[0]), &(currentword[0])); /* Copy the word */ /* Now decode class and group: */ c = resource->data[seeker + 1]; words[counter]->w_class = ((resource->data[seeker]) << 4) | ((c & 0xf0) >> 4); words[counter]->group = (resource->data[seeker + 2]) | ((c & 0x0f) << 8); seeker += 3; ++counter; } *word_counter = counter; qsort(words, counter, sizeof(word_t *), _vocab_cmp_words); /* Sort entries */ return words; } void vocab_free_words(word_t **words, int words_nr) { int i; for (i = 0; i < words_nr; i++) free(words[i]); free(words); } const char * vocab_get_any_group_word(int group, word_t **words, int words_nr) { int i; if (group == VOCAB_MAGIC_NUMBER_GROUP) return "{number}"; for (i = 0; i < words_nr; i++) if (words[i]->group == group) return words[i]->word; return "{invalid}"; } static inline unsigned int inverse_16(unsigned int foo) { return (((foo & 0xff) << 8) | ((foo & 0xff00) >> 8)); } suffix_t ** vocab_get_suffices(ResourceManager *resmgr, int *suffices_nr) { int counter = 0; suffix_t **suffices; resource_t *resource = scir_find_resource(resmgr, sci_vocab, VOCAB_RESOURCE_SUFFIX_VOCAB, 1); unsigned int seeker = 1; if (!resource) { error("Could not find suffix vocabulary"); return NULL; /* Not critical */ } suffices = (suffix_t**)sci_malloc(sizeof(suffix_t *)); while ((seeker < resource->size - 1) && (resource->data[seeker + 1] != 0xff)) { char *alt_suffix = (char *) resource->data + seeker; int alt_len = strlen(alt_suffix); char *word_suffix; int word_len; suffices = (suffix_t**)sci_realloc(suffices, sizeof(suffix_t *) * (counter + 1)); seeker += alt_len + 1; /* Hit end of string */ word_suffix = (char *) resource->data + seeker + 3; /* Beginning of next string +1 (ignore '*') */ word_len = strlen(word_suffix); suffices[counter] = (suffix_t*)sci_malloc(sizeof(suffix_t)); /* allocate enough memory to store the strings */ suffices[counter]->word_suffix = word_suffix; suffices[counter]->alt_suffix = alt_suffix; suffices[counter]->alt_suffix_length = alt_len; suffices[counter]->word_suffix_length = word_len; suffices[counter]->class_mask = inverse_16(getInt16(resource->data + seeker)); /* Inverse endianness */ seeker += word_len + 4; suffices[counter]->result_class = inverse_16(getInt16(resource->data + seeker)); seeker += 3; /* Next entry */ ++counter; } *suffices_nr = counter; return suffices; } void vocab_free_suffices(ResourceManager *resmgr, suffix_t **suffices, int suffices_nr) { int i; scir_unlock_resource(resmgr, scir_find_resource(resmgr, sci_vocab, VOCAB_RESOURCE_SUFFIX_VOCAB, 0), VOCAB_RESOURCE_SUFFIX_VOCAB, sci_vocab); for (i = 0; i < suffices_nr; i++) free(suffices[i]); free(suffices); } void vocab_free_branches(parse_tree_branch_t *parser_branches) { if (parser_branches) free(parser_branches); } parse_tree_branch_t * vocab_get_branches(ResourceManager * resmgr, int *branches_nr) { resource_t *resource = scir_find_resource(resmgr, sci_vocab, VOCAB_RESOURCE_PARSE_TREE_BRANCHES, 0); parse_tree_branch_t *retval; int i; if (!resource) { error("No parser tree data found"); return NULL; } *branches_nr = resource->size / 20; if (*branches_nr == 0) { error("Parser tree data is empty"); return NULL; } retval = (parse_tree_branch_t*)sci_malloc(sizeof(parse_tree_branch_t) * *branches_nr); for (i = 0; i < *branches_nr; i++) { int k; byte *base = resource->data + i * 20; retval[i].id = getInt16(base); for (k = 0; k < 9; k++) retval[i].data[k] = getUInt16(base + 2 + 2 * k); retval[i].data[9] = 0; /* Always terminate */ } if (!retval[*branches_nr - 1].id) /* branch lists may be terminated by empty rules */ --(*branches_nr); return retval; } result_word_t * vocab_lookup_word(char *word, int word_len, word_t **words, int words_nr, suffix_t **suffices, int suffices_nr) { word_t *tempword = (word_t*)sci_malloc(sizeof(word_t) + word_len + 256); /* 256: For suffices. Should suffice. */ word_t **dict_word; result_word_t *retval; char *tester; int i, word_len_tmp; strncpy(&(tempword->word[0]), word, word_len); tempword->word[word_len] = 0; word_len_tmp = word_len; while ((tester = strchr(tempword->word, '-'))) memmove(tester, tester + 1, (tempword->word + word_len_tmp--) - tester); retval = (result_word_t*)sci_malloc(sizeof(result_word_t)); dict_word = (word_t**)bsearch(&tempword, words, words_nr, sizeof(word_t *), _vocab_cmp_words); if (dict_word) { free(tempword); retval->w_class = (*dict_word)->w_class; retval->group = (*dict_word)->group; return retval; } /* Now try all suffices */ for (i = 0; i < suffices_nr; i++) if (suffices[i]->alt_suffix_length <= word_len) { int suff_index = word_len - suffices[i]->alt_suffix_length; /* Offset of the start of the suffix */ if (scumm_strnicmp(suffices[i]->alt_suffix, word + suff_index, suffices[i]->alt_suffix_length) == 0) { /* Suffix matched! */ strncpy(&(tempword->word[0]), word, word_len); tempword->word[suff_index] = 0; /* Terminate word at suffix start position... */ strncat(&(tempword->word[0]), suffices[i]->word_suffix, suffices[i]->word_suffix_length); /* ...and append "correct" suffix */ dict_word = (word_t**)bsearch(&tempword, words, words_nr, sizeof(word_t *), _vocab_cmp_words); if ((dict_word) && ((*dict_word)->w_class & suffices[i]->class_mask)) { /* Found it? */ free(tempword); retval->w_class = suffices[i]->result_class; /* Use suffix class */ retval->group = (*dict_word)->group; return retval; } } } /* No match so far? Check if it's a number. */ strncpy(&(tempword->word[0]), word, word_len); tempword->word[word_len] = 0; word_len_tmp = word_len; while ((tester = strchr(tempword->word, '-'))) memmove(tester, tester + 1, (tempword->word + word_len--) - tester); if ((strtol(&(tempword->word[0]), &tester, 10) >= 0) && (*tester == '\0')) { /* Do we have a complete number here? */ free(tempword); retval->group = VOCAB_MAGIC_NUMBER_GROUP; retval->w_class = VOCAB_CLASS_NUMBER; return(retval); } free(tempword); free(retval); return NULL; } int vocab_get_said_spec_length(byte *addr) { int result = 0; while (*addr != 0xff) { if (*addr < 0xf0) { result += 2; addr += 2; } else { result += 1; addr += 1; } } return result + 1; } void vocab_decypher_said_block(state_t *s, byte *addr) { int nextitem; do { nextitem = *addr++; if (nextitem < 0xf0) { nextitem = nextitem << 8 | *addr++; sciprintf(" %s[%03x]", vocab_get_any_group_word(nextitem, s->parser_words, s->parser_words_nr), nextitem); nextitem = 42; /* Make sure that group 0xff doesn't abort */ } else switch (nextitem) { case 0xf0: sciprintf(" ,"); break; case 0xf1: sciprintf(" &"); break; case 0xf2: sciprintf(" /"); break; case 0xf3: sciprintf(" ("); break; case 0xf4: sciprintf(" )"); break; case 0xf5: sciprintf(" ["); break; case 0xf6: sciprintf(" ]"); break; case 0xf7: sciprintf(" #"); break; case 0xf8: sciprintf(" <"); break; case 0xf9: sciprintf(" >"); break; case 0xff: break; } } while (nextitem != 0xff); sciprintf("\n"); } #ifdef SCI_SIMPLE_SAID_CODE static short _related_words[][2] = { /* 0 is backwards, 1 is forward */ {0x800, 0x180}, /* preposition */ {0x000, 0x180}, /* article */ {0x000, 0x180}, /* adjective */ {0x800, 0x000}, /* pronoun */ {0x800, 0x180}, /* noun */ {0x000, 0x800}, /* auxiliary verb */ {0x800, 0x800}, /* adverb */ {0x000, 0x180}, /* verb */ {0x000, 0x180} /* number */ }; int vocab_build_simple_parse_tree(parse_tree_node_t *nodes, result_word_t *words, int words_nr) { int i, length, pos = 0; for (i = 0; i < words_nr; ++i) { if (words[i].classID != VOCAB_CLASS_ANYWORD) { nodes[pos].type = words[i].classID; nodes[pos].content.value = words[i].group; pos += 2; /* Link information is filled in below */ } } nodes[pos].type = -1; /* terminate */ length = pos >> 1; /* now find all referenced words */ #ifdef SCI_SIMPLE_SAID_DEBUG sciprintf("Semantic references:\n"); #endif for (i = 0; i < length; i++) { int j; int searchmask; int type; pos = (i << 1); type = sci_ffs(nodes[pos].type); if (type) { int found = -1; type -= 5; /* 1 because ffs starts counting at 1, 4 because nodes[pos].type is a nibble off */ if (type < 0) type = 0; #ifdef SCI_SIMPLE_SAID_DEBUG sciprintf("#%d: Word %04x: type %04x\n", i, nodes[pos].content.value, type); #endif /* search backwards */ searchmask = _related_words[type][0]; if (searchmask) { for (j = i - 1; j >= 0; j--) if (nodes[j << 1].type & searchmask) { found = j << 1; break; } } nodes[pos+1].content.branches[0] = found; #ifdef SCI_SIMPLE_SAID_DEBUG if (found > -1) sciprintf(" %d <\n", found >> 1); #endif /* search forward */ found = -1; searchmask = _related_words[type][1]; if (searchmask) { for (j = i + 1; j < length; j++) if (nodes[j << 1].type & searchmask) { found = j << 1; break; } } #ifdef SCI_SIMPLE_SAID_DEBUG if (found > -1) sciprintf(" > %d\n", found >> 1); #endif } else { #ifdef SCI_SIMPLE_SAID_DEBUG sciprintf("#%d: Untypified word\n", i); /* Weird, but not fatal */ #endif nodes[pos+1].content.branches[0] = -1; nodes[pos+1].content.branches[1] = -1; } } #ifdef SCI_SIMPLE_SAID_DEBUG sciprintf("/Semantic references.\n"); #endif return 0; } #endif result_word_t * vocab_tokenize_string(char *sentence, int *result_nr, word_t **words, int words_nr, suffix_t **suffices, int suffices_nr, char **error) { char *lastword = sentence; int pos_in_sentence = 0; char c; int wordlen = 0; result_word_t *retval = (result_word_t*)sci_malloc(sizeof(result_word_t)); /* malloc'd size is always one result_word_t too big */ result_word_t *lookup_result; *result_nr = 0; *error = NULL; do { c = sentence[pos_in_sentence++]; if (isalnum(c) || (c == '-' && wordlen)) ++wordlen; /* Continue on this word */ /* Words may contain a '-', but may not ** start with one. */ else { if (wordlen) { /* Finished a word? */ lookup_result = vocab_lookup_word(lastword, wordlen, words, words_nr, suffices, suffices_nr); /* Look it up */ if (!lookup_result) { /* Not found? */ *error = (char*)sci_calloc(wordlen + 1, 1); strncpy(*error, lastword, wordlen); /* Set the offending word */ free(retval); return NULL; /* And return with error */ } memcpy(retval + *result_nr, lookup_result, sizeof(result_word_t)); /* Copy into list */ ++(*result_nr); /* Increase number of resulting words */ free(lookup_result); retval = (result_word_t*)sci_realloc(retval, sizeof(result_word_t) * (*result_nr + 1)); } lastword = sentence + pos_in_sentence; wordlen = 0; } } while (c); /* Until terminator is hit */ if (*result_nr == 0) { free(retval); return NULL; } return retval; } void _vocab_recursive_ptree_dump_treelike(parse_tree_node_t *nodes, int nr, int prevnr) { if ((nr > VOCAB_TREE_NODES)/* || (nr < prevnr)*/) { sciprintf("Error(%04x)", nr); return; } if (nodes[nr].type == PARSE_TREE_NODE_LEAF) /* sciprintf("[%03x]%04x", nr, nodes[nr].content.value); */ sciprintf("%x", nodes[nr].content.value); else { int lbranch = nodes[nr].content.branches[0]; int rbranch = nodes[nr].content.branches[1]; /* sciprintf("<[%03x]",nr); */ sciprintf("<"); if (lbranch) _vocab_recursive_ptree_dump_treelike(nodes, lbranch, nr); else sciprintf("NULL"); sciprintf(","); if (rbranch) _vocab_recursive_ptree_dump_treelike(nodes, rbranch, nr); else sciprintf("NULL"); sciprintf(">"); } } void _vocab_recursive_ptree_dump(parse_tree_node_t *nodes, int nr, int prevnr, int blanks) { int lbranch = nodes[nr].content.branches[0]; int rbranch = nodes[nr].content.branches[1]; int i; if (nodes[nr].type == PARSE_TREE_NODE_LEAF) { sciprintf("vocab_dump_parse_tree: Error: consp is nil for element %03x\n", nr); return; } if ((nr > VOCAB_TREE_NODES)/* || (nr < prevnr)*/) { sciprintf("Error(%04x))", nr); return; } if (lbranch) { if (nodes[lbranch].type == PARSE_TREE_NODE_BRANCH) { sciprintf("\n"); for (i = 0; i < blanks; i++) sciprintf(" "); sciprintf("("); _vocab_recursive_ptree_dump(nodes, lbranch, nr, blanks + 1); sciprintf(")\n"); for (i = 0; i < blanks; i++) sciprintf(" "); } else sciprintf("%x", nodes[lbranch].content.value); sciprintf(" "); }/* else sciprintf ("nil"); */ if (rbranch) { if (nodes[rbranch].type == PARSE_TREE_NODE_BRANCH) _vocab_recursive_ptree_dump(nodes, rbranch, nr, blanks); else sciprintf("%x", nodes[rbranch].content.value); }/* else sciprintf("nil");*/ } void vocab_dump_parse_tree(const char *tree_name, parse_tree_node_t *nodes) { /* _vocab_recursive_ptree_dump_treelike(nodes, 0, 0); */ sciprintf("(setq %s \n'(", tree_name); _vocab_recursive_ptree_dump(nodes, 0, 0, 1); sciprintf("))\n"); } void vocab_synonymize_tokens(result_word_t *words, int words_nr, synonym_t *synonyms, int synonyms_nr) { int i, sync; if (!synonyms || !synonyms_nr) return; /* No synonyms: Nothing to check */ for (i = 0; i < words_nr; i++) for (sync = 0; sync < synonyms_nr; sync++) if (words[i].group == synonyms[sync].replaceant) words[i].group = synonyms[sync].replacement; }