scummvm/engines/sci/parser/grammar.cpp
Willem Jan Palenstijn f98536eef5 SCI: Allow multiple word groups in parser
In SCI01 and up, each typed word may be interpreted as multiple
class,group pairs. This patch adds support to the vocabulary and
parser. It uses the matcher support added in r52985.

This fixes parser issues in German LSL3, but needs testing.

svn-id: r52989
2010-10-03 10:49:42 +00:00

640 lines
17 KiB
C++

/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* $URL$
* $Id$
*
*/
/* Functionality to transform the context-free SCI grammar rules into
* strict Greibach normal form (strict GNF), and to test SCI input against
* that grammar, writing an appropriate node tree if successful.
*/
#include "sci/parser/vocabulary.h"
#include "sci/console.h"
#include "common/array.h"
namespace Sci {
#define TOKEN_OPAREN 0xff000000
#define TOKEN_CPAREN 0xfe000000
#define TOKEN_TERMINAL_CLASS 0x10000
#define TOKEN_TERMINAL_GROUP 0x20000
#define TOKEN_STUFFING_LEAF 0x40000
#define TOKEN_STUFFING_WORD 0x80000
#define TOKEN_NON_NT (TOKEN_OPAREN | TOKEN_TERMINAL_CLASS | TOKEN_TERMINAL_GROUP | TOKEN_STUFFING_LEAF | TOKEN_STUFFING_WORD)
#define TOKEN_TERMINAL (TOKEN_TERMINAL_CLASS | TOKEN_TERMINAL_GROUP)
static int _allocd_rules = 0; // FIXME: Avoid non-const global vars
struct ParseRule {
int _id; /**< non-terminal ID */
uint _firstSpecial; /**< first terminal or non-terminal */
uint _numSpecials; /**< number of terminals and non-terminals */
Common::Array<int> _data; /**< actual data */
~ParseRule() {
assert(_allocd_rules > 0);
--_allocd_rules;
}
// FIXME remove this one again?
bool operator==(const ParseRule &other) const {
return _id == other._id &&
_firstSpecial == other._firstSpecial &&
_numSpecials == other._numSpecials &&
_data == other._data;
}
};
struct ParseRuleList {
int terminal; /**< Terminal character this rule matches against or 0 for a non-terminal rule */
ParseRule *rule;
ParseRuleList *next;
void print() const;
ParseRuleList(ParseRule *r) : rule(r), next(0) {
int term = rule->_data[rule->_firstSpecial];
terminal = ((term & TOKEN_TERMINAL) ? term : 0);
}
~ParseRuleList() {
delete rule;
delete next;
}
};
static void vocab_print_rule(ParseRule *rule) {
int wspace = 0;
if (!rule) {
warning("NULL rule");
return;
}
printf("[%03x] -> ", rule->_id);
if (rule->_data.empty())
printf("e");
for (uint i = 0; i < rule->_data.size(); i++) {
uint token = rule->_data[i];
if (token == TOKEN_OPAREN) {
if (i == rule->_firstSpecial)
printf("_");
printf("(");
wspace = 0;
} else if (token == TOKEN_CPAREN) {
if (i == rule->_firstSpecial)
printf("_");
printf(")");
wspace = 0;
} else {
if (wspace)
printf(" ");
if (i == rule->_firstSpecial)
printf("_");
if (token & TOKEN_TERMINAL_CLASS)
printf("C(%04x)", token & 0xffff);
else if (token & TOKEN_TERMINAL_GROUP)
printf("G(%04x)", token & 0xffff);
else if (token & TOKEN_STUFFING_LEAF)
printf("%03x", token & 0xffff);
else if (token & TOKEN_STUFFING_WORD)
printf("{%03x}", token & 0xffff);
else
printf("[%03x]", token); /* non-terminal */
wspace = 1;
}
if (i == rule->_firstSpecial)
printf("_");
}
printf(" [%d specials]", rule->_numSpecials);
}
static ParseRule *_vdup(ParseRule *a) {
++_allocd_rules;
return new ParseRule(*a);
}
static ParseRule *_vinsert(ParseRule *turkey, ParseRule *stuffing) {
uint firstnt = turkey->_firstSpecial;
// Search for first TOKEN_NON_NT in 'turkey'
while ((firstnt < turkey->_data.size()) && (turkey->_data[firstnt] & TOKEN_NON_NT))
firstnt++;
// If no TOKEN_NON_NT found, or if it doesn't match the id of 'stuffing', abort.
if ((firstnt == turkey->_data.size()) || (turkey->_data[firstnt] != stuffing->_id))
return NULL;
// Create a new rule as a copy of 'turkey', where the token firstnt has been substituted
// by the rule 'stuffing'.
++_allocd_rules;
ParseRule *rule = new ParseRule(*turkey);
rule->_numSpecials += stuffing->_numSpecials - 1;
rule->_firstSpecial = firstnt + stuffing->_firstSpecial;
rule->_data.resize(turkey->_data.size() - 1 + stuffing->_data.size());
// Replace rule->_data[firstnt] by all of stuffing->_data
Common::copy(stuffing->_data.begin(), stuffing->_data.end(), rule->_data.begin() + firstnt);
if (firstnt < turkey->_data.size() - 1)
Common::copy(turkey->_data.begin() + firstnt + 1, turkey->_data.end(),
rule->_data.begin() + firstnt + stuffing->_data.size());
return rule;
}
static ParseRule *_vbuild_rule(const parse_tree_branch_t *branch) {
int tokens = 0, tokenpos = 0, i;
while (tokenpos < 10 && branch->data[tokenpos]) {
int type = branch->data[tokenpos];
tokenpos += 2;
if ((type == VOCAB_TREE_NODE_COMPARE_TYPE) || (type == VOCAB_TREE_NODE_COMPARE_GROUP) || (type == VOCAB_TREE_NODE_FORCE_STORAGE))
++tokens;
else if (type > VOCAB_TREE_NODE_LAST_WORD_STORAGE)
tokens += 5;
else
return NULL; // invalid
}
ParseRule *rule = new ParseRule();
++_allocd_rules;
rule->_id = branch->id;
rule->_numSpecials = tokenpos >> 1;
rule->_data.resize(tokens);
rule->_firstSpecial = 0;
tokens = 0;
for (i = 0; i < tokenpos; i += 2) {
int type = branch->data[i];
int value = branch->data[i + 1];
if (type == VOCAB_TREE_NODE_COMPARE_TYPE)
rule->_data[tokens++] = value | TOKEN_TERMINAL_CLASS;
else if (type == VOCAB_TREE_NODE_COMPARE_GROUP)
rule->_data[tokens++] = value | TOKEN_TERMINAL_GROUP;
else if (type == VOCAB_TREE_NODE_FORCE_STORAGE)
rule->_data[tokens++] = value | TOKEN_STUFFING_WORD;
else { // normal inductive rule
rule->_data[tokens++] = TOKEN_OPAREN;
rule->_data[tokens++] = type | TOKEN_STUFFING_LEAF;
rule->_data[tokens++] = value | TOKEN_STUFFING_LEAF;
if (i == 0)
rule->_firstSpecial = tokens;
rule->_data[tokens++] = value; // The non-terminal
rule->_data[tokens++] = TOKEN_CPAREN;
}
}
return rule;
}
static ParseRule *_vsatisfy_rule(ParseRule *rule, const ResultWordList &input) {
int dep;
if (!rule->_numSpecials)
return NULL;
dep = rule->_data[rule->_firstSpecial];
int count = 0;
int match = 0;
ResultWordList::const_iterator iter;
// TODO: Inserting an array in the middle of another array is slow
Common::Array<int> matches;
matches.reserve(input.size());
// We store the first match in 'match', and any subsequent matches in
// 'matches'. 'match' replaces the special in the rule, and 'matches' gets
// inserted after it.
for (iter = input.begin(); iter != input.end(); ++iter)
if (((dep & TOKEN_TERMINAL_CLASS) && ((dep & 0xffff) & iter->_class)) ||
((dep & TOKEN_TERMINAL_GROUP) && ((dep & 0xffff) & iter->_group))) {
if (count == 0)
match = TOKEN_STUFFING_WORD | iter->_group;
else
matches.push_back(TOKEN_STUFFING_WORD | iter->_group);
count++;
}
if (count) {
ParseRule *retval = new ParseRule(*rule);
++_allocd_rules;
retval->_data[rule->_firstSpecial] = match;
if (count > 1)
retval->_data.insert_at(rule->_firstSpecial+1, matches);
retval->_numSpecials--;
retval->_firstSpecial = 0;
if (retval->_numSpecials) { // find first special, if it exists
for (uint i = rule->_firstSpecial; i < retval->_data.size(); ++i) {
int tmp = retval->_data[i];
if (!(tmp & TOKEN_NON_NT) || (tmp & TOKEN_TERMINAL)) {
retval->_firstSpecial = i;
break;
}
}
}
return retval;
} else
return NULL;
}
void Vocabulary::freeRuleList(ParseRuleList *list) {
delete list;
}
static ParseRuleList *_vocab_add_rule(ParseRuleList *list, ParseRule *rule) {
if (!rule)
return list;
if (!rule->_data.size()) {
// Special case for qfg2 demo
warning("no rule contents on _vocab_add_rule()");
return list;
}
ParseRuleList *new_elem = new ParseRuleList(rule);
if (list) {
const int term = new_elem->terminal;
/* if (term < list->terminal) {
new_elem->next = list;
return new_elem;
} else {*/
ParseRuleList *seeker = list;
while (seeker->next/* && seeker->next->terminal <= term*/) {
if (seeker->next->terminal == term) {
if (*(seeker->next->rule) == *rule) {
delete new_elem; // NB: This also deletes 'rule'
return list; // No duplicate rules
}
}
seeker = seeker->next;
}
new_elem->next = seeker->next;
seeker->next = new_elem;
return list;
} else {
return new_elem;
}
}
void ParseRuleList::print() const {
const ParseRuleList *list = this;
int pos = 0;
while (list) {
printf("R%03d: ", pos);
vocab_print_rule(list->rule);
printf("\n");
list = list->next;
pos++;
}
printf("%d rules total.\n", pos);
}
static ParseRuleList *_vocab_split_rule_list(ParseRuleList *list) {
assert(list);
if (!list->next || (list->next->terminal)) {
ParseRuleList *tmp = list->next;
list->next = NULL;
return tmp;
} else
return _vocab_split_rule_list(list->next);
}
static void _vocab_free_empty_rule_list(ParseRuleList *list) {
assert(list);
if (list->next)
_vocab_free_empty_rule_list(list->next);
list->next = 0;
list->rule = 0;
delete list;
}
static ParseRuleList *_vocab_merge_rule_lists(ParseRuleList *l1, ParseRuleList *l2) {
ParseRuleList *retval = l1, *seeker = l2;
while (seeker) {
retval = _vocab_add_rule(retval, seeker->rule);
seeker = seeker->next;
}
_vocab_free_empty_rule_list(l2);
return retval;
}
static int _vocab_rule_list_length(ParseRuleList *list) {
return ((list) ? _vocab_rule_list_length(list->next) + 1 : 0);
}
static ParseRuleList *_vocab_clone_rule_list_by_id(ParseRuleList *list, int id) {
ParseRuleList *result = NULL;
ParseRuleList *seeker = list;
while (seeker) {
if (seeker->rule->_id == id) {
result = _vocab_add_rule(result, _vdup(seeker->rule));
}
seeker = seeker->next;
}
return result;
}
ParseRuleList *Vocabulary::buildGNF(bool verbose) {
int iterations = 0;
int last_termrules, termrules = 0;
int ntrules_nr;
ParseRuleList *ntlist = NULL;
ParseRuleList *tlist, *new_tlist;
Console *con = g_sci->getSciDebugger();
for (uint i = 1; i < _parserBranches.size(); i++) { // branch rule 0 is treated specially
ParseRule *rule = _vbuild_rule(&_parserBranches[i]);
if (!rule)
return NULL;
ntlist = _vocab_add_rule(ntlist, rule);
}
tlist = _vocab_split_rule_list(ntlist);
ntrules_nr = _vocab_rule_list_length(ntlist);
if (verbose)
con->DebugPrintf("Starting with %d rules\n", ntrules_nr);
new_tlist = tlist;
tlist = NULL;
do {
ParseRuleList *new_new_tlist = NULL;
ParseRuleList *ntseeker, *tseeker;
last_termrules = termrules;
ntseeker = ntlist;
while (ntseeker) {
tseeker = new_tlist;
while (tseeker) {
ParseRule *newrule = _vinsert(ntseeker->rule, tseeker->rule);
if (newrule)
new_new_tlist = _vocab_add_rule(new_new_tlist, newrule);
tseeker = tseeker->next;
}
ntseeker = ntseeker->next;
}
tlist = _vocab_merge_rule_lists(tlist, new_tlist);
new_tlist = new_new_tlist;
termrules = _vocab_rule_list_length(new_new_tlist);
if (verbose)
con->DebugPrintf("After iteration #%d: %d new term rules\n", ++iterations, termrules);
} while (termrules && (iterations < 30));
freeRuleList(ntlist);
if (verbose) {
con->DebugPrintf("\nGNF rules:\n");
tlist->print();
con->DebugPrintf("%d allocd rules\n", _allocd_rules);
con->DebugPrintf("Freeing rule list...\n");
freeRuleList(tlist);
return NULL;
}
return tlist;
}
static int _vbpt_pareno(ParseTreeNode *nodes, int *pos, int base) {
// Opens parentheses
nodes[base].left = &nodes[(*pos) + 1];
nodes[++(*pos)].type = kParseTreeBranchNode;
nodes[*pos].left = 0;
nodes[*pos].right = 0;
return *pos;
}
static int _vbpt_parenc(ParseTreeNode *nodes, int *pos, int paren) {
// Closes parentheses for appending
nodes[paren].right = &nodes[++(*pos)];
nodes[*pos].type = kParseTreeBranchNode;
nodes[*pos].left = 0;
nodes[*pos].right = 0;
return *pos;
}
static int _vbpt_append(ParseTreeNode *nodes, int *pos, int base, int value) {
// writes one value to an existing base node and creates a successor node for writing
nodes[base].left = &nodes[++(*pos)];
nodes[*pos].type = kParseTreeLeafNode;
nodes[*pos].value = value;
nodes[*pos].right = 0;
nodes[base].right = &nodes[++(*pos)];
nodes[*pos].type = kParseTreeBranchNode;
nodes[*pos].left = 0;
nodes[*pos].right = 0;
return *pos;
}
static int _vbpt_terminate(ParseTreeNode *nodes, int *pos, int base, int value) {
// Terminates, overwriting a nextwrite forknode
nodes[base].type = kParseTreeLeafNode;
nodes[base].value = value;
nodes[base].right = 0;
return *pos;
}
static int _vbpt_append_word(ParseTreeNode *nodes, int *pos, int base, int value) {
// writes one value to an existing node and creates a sibling for writing
nodes[base].type = kParseTreeWordNode;
nodes[base].value = value;
nodes[base].right = &nodes[++(*pos)];
nodes[*pos].type = kParseTreeBranchNode;
nodes[*pos].left = 0;
nodes[*pos].right = 0;
return *pos;
}
static int _vbpt_terminate_word(ParseTreeNode *nodes, int *pos, int base, int value) {
// Terminates, overwriting a nextwrite forknode
nodes[base].type = kParseTreeWordNode;
nodes[base].value = value;
nodes[base].right = 0;
return *pos;
}
static int _vbpt_write_subexpression(ParseTreeNode *nodes, int *pos, ParseRule *rule, uint rulepos, int writepos) {
uint token;
while ((token = ((rulepos < rule->_data.size()) ? rule->_data[rulepos++] : TOKEN_CPAREN)) != TOKEN_CPAREN) {
uint nexttoken = (rulepos < rule->_data.size()) ? rule->_data[rulepos] : TOKEN_CPAREN;
if (token == TOKEN_OPAREN) {
int writepos2 = _vbpt_pareno(nodes, pos, writepos);
rulepos = _vbpt_write_subexpression(nodes, pos, rule, rulepos, writepos2);
nexttoken = (rulepos < rule->_data.size()) ? rule->_data[rulepos] : TOKEN_CPAREN;
if (nexttoken != TOKEN_CPAREN)
writepos = _vbpt_parenc(nodes, pos, writepos);
} else if (token & TOKEN_STUFFING_LEAF) {
if (nexttoken == TOKEN_CPAREN)
writepos = _vbpt_terminate(nodes, pos, writepos, token & 0xffff);
else
writepos = _vbpt_append(nodes, pos, writepos, token & 0xffff);
} else if (token & TOKEN_STUFFING_WORD) {
if (nexttoken == TOKEN_CPAREN)
writepos = _vbpt_terminate_word(nodes, pos, writepos, token & 0xffff);
else
writepos = _vbpt_append_word(nodes, pos, writepos, token & 0xffff);
} else {
printf("\nError in parser (grammar.cpp, _vbpt_write_subexpression()): Rule data broken in rule ");
vocab_print_rule(rule);
printf(", at token position %d\n", *pos);
return rulepos;
}
}
return rulepos;
}
int Vocabulary::parseGNF(const ResultWordListList &words, bool verbose) {
Console *con = g_sci->getSciDebugger();
// Get the start rules:
ParseRuleList *work = _vocab_clone_rule_list_by_id(_parserRules, _parserBranches[0].data[1]);
ParseRuleList *results = NULL;
uint word = 0;
const uint words_nr = words.size();
ResultWordListList::const_iterator words_iter;
for (words_iter = words.begin(); words_iter != words.end(); ++words_iter, ++word) {
ParseRuleList *new_work = NULL;
ParseRuleList *reduced_rules = NULL;
ParseRuleList *seeker, *subseeker;
if (verbose)
con->DebugPrintf("Adding word %d...\n", word);
seeker = work;
while (seeker) {
if (seeker->rule->_numSpecials <= (words_nr - word)) {
reduced_rules = _vocab_add_rule(reduced_rules, _vsatisfy_rule(seeker->rule, *words_iter));
}
seeker = seeker->next;
}
if (reduced_rules == NULL) {
freeRuleList(work);
if (verbose)
con->DebugPrintf("No results.\n");
return 1;
}
freeRuleList(work);
if (word + 1 < words_nr) {
seeker = reduced_rules;
while (seeker) {
if (seeker->rule->_numSpecials) {
int my_id = seeker->rule->_data[seeker->rule->_firstSpecial];
subseeker = _parserRules;
while (subseeker) {
if (subseeker->rule->_id == my_id)
new_work = _vocab_add_rule(new_work, _vinsert(seeker->rule, subseeker->rule));
subseeker = subseeker->next;
}
}
seeker = seeker->next;
}
freeRuleList(reduced_rules);
} else // last word
new_work = reduced_rules;
work = new_work;
if (verbose)
con->DebugPrintf("Now at %d candidates\n", _vocab_rule_list_length(work));
if (work == NULL) {
if (verbose)
con->DebugPrintf("No results.\n");
return 1;
}
}
results = work;
if (verbose) {
con->DebugPrintf("All results (excluding the surrounding '(141 %03x' and ')'):\n", _parserBranches[0].id);
results->print();
con->DebugPrintf("\n");
}
// now use the first result
{
int temp, pos;
_parserNodes[0].type = kParseTreeBranchNode;
_parserNodes[0].left = &_parserNodes[1];
_parserNodes[0].right = &_parserNodes[2];
_parserNodes[1].type = kParseTreeLeafNode;
_parserNodes[1].value = 0x141;
_parserNodes[1].right = 0;
_parserNodes[2].type = kParseTreeBranchNode;
_parserNodes[2].left = 0;
_parserNodes[2].right = 0;
pos = 2;
temp = _vbpt_append(_parserNodes, &pos, 2, _parserBranches[0].id);
//_vbpt_write_subexpression(nodes, &pos, results[_vocab_rule_list_length(results)].rule, 0, temp);
_vbpt_write_subexpression(_parserNodes, &pos, results->rule, 0, temp);
}
freeRuleList(results);
return 0;
}
} // End of namespace Sci