/**************************************************************************
 grammar.c Copyright (C) 2000 Christoph Reichenbach


 This program may be modified and copied freely according to the terms of
 the GNU general public license (GPL), as long as the above copyright
 notice and the licensing information contained herein are preserved.

 Please refer to www.gnu.org for licensing details.

 This work is provided AS IS, without warranty of any kind, expressed or
 implied, including but not limited to the warranties of merchantibility,
 noninfringement, and fitness for a specific purpose. The author will not
 be held liable for any damage caused by this work or derivatives of it.

 By using this source code, you agree to the licensing terms as stated
 above.

***************************************************************************/

/* Functionality to transform the context-free SCI grammar rules into
** strict Greibach normal form (strict GNF), and to test SCI input against
** that grammar, writing an appropriate node tree if successful.
*/

#include "sci/include/resource.h"
#include "sci/include/vocabulary.h"
#include "sci/include/console.h"
#include <stdarg.h>
#include <string.h>

#define TOKEN_OPAREN 0xff000000
#define TOKEN_CPAREN 0xfe000000
#define TOKEN_TERMINAL_CLASS 0x10000
#define TOKEN_TERMINAL_GROUP 0x20000
#define TOKEN_STUFFING_WORD 0x40000
#define TOKEN_NON_NT (TOKEN_OPAREN | TOKEN_TERMINAL_CLASS | TOKEN_TERMINAL_GROUP | TOKEN_STUFFING_WORD)
#define TOKEN_TERMINAL (TOKEN_TERMINAL_CLASS | TOKEN_TERMINAL_GROUP)

 int _allocd_rules = 0;

static void
vocab_print_rule(parse_rule_t *rule)
{
  int i;
  int wspace = 0;

  if (!rule) {
    sciprintf("NULL rule");
    return;
  }

  sciprintf("[%03x] -> ", rule->id);

  if (!rule->length)
    sciprintf("e");

  for(i = 0; i < rule->length; i++) {
    uint token = rule->data[i];

    if (token == TOKEN_OPAREN) {

      if (i == rule->first_special)
	sciprintf("_");

      sciprintf("(");
      wspace = 0;
    } else if (token == TOKEN_CPAREN) {

      if (i == rule->first_special)
	sciprintf("_");

      sciprintf(")");
      wspace = 0;
    } else {
      if (wspace)
	sciprintf(" ");

      if (i == rule->first_special)
	sciprintf("_");
      if (token & TOKEN_TERMINAL_CLASS)
	sciprintf("C(%04x)", token & 0xffff);
      else if (token & TOKEN_TERMINAL_GROUP)
	sciprintf("G(%04x)", token & 0xffff);
      else if (token & TOKEN_STUFFING_WORD)
	sciprintf("%03x", token & 0xffff);
      else
	sciprintf("[%03x]", token); /* non-terminal */
      wspace = 1;
    }

    if (i == rule->first_special)
      sciprintf("_");
  }
  sciprintf(" [%d specials]", rule->specials_nr);
}


static void
_vfree(parse_rule_t *rule)
{
  free(rule);
  --_allocd_rules;
  rule = NULL;
}

static parse_rule_t *
_vdup(parse_rule_t *a)
{
  parse_rule_t *rule = (parse_rule_t*)sci_malloc(sizeof(int) * (a->length + 4));

  rule->id = a->id;
  rule->length = a->length;
  rule->specials_nr = a->specials_nr;
  rule->first_special = a->first_special;
  ++_allocd_rules;

  memcpy(rule->data, a->data, sizeof(int) * a->length);

  return rule;
}

static parse_rule_t *
_vinsert(parse_rule_t *turkey, parse_rule_t *stuffing)
{
  int firstnt = turkey->first_special;
  parse_rule_t *rule;

  while ((firstnt < turkey->length)
	 && (turkey->data[firstnt] & TOKEN_NON_NT))
    firstnt++;

  if ((firstnt == turkey->length)
      || (turkey->data[firstnt] != stuffing->id))
    return NULL;

  rule = (parse_rule_t*)sci_malloc(sizeof(int) * (turkey->length - 1 + stuffing->length + 4));
  rule->id = turkey->id;
  rule->specials_nr = turkey->specials_nr + stuffing->specials_nr - 1;
  rule->first_special = firstnt + stuffing->first_special;
  rule->length = turkey->length - 1 + stuffing->length;
  ++_allocd_rules;

  if (firstnt > 0)
    memcpy(rule->data, turkey->data, sizeof(int) * firstnt);
  memcpy(&(rule->data[firstnt]), stuffing->data, sizeof(int) * stuffing->length);
  if (firstnt < turkey->length - 1)
    memcpy(&(rule->data[firstnt + stuffing->length]), &(turkey->data[firstnt + 1]),
	   sizeof(int) * (turkey->length - firstnt - 1));

  return rule;
}

static parse_rule_t *
_vbuild_rule(parse_tree_branch_t *branch)
{
  parse_rule_t *rule;
  int tokens = 0, tokenpos = 0, i;

  while (tokenpos < 10 && branch->data[tokenpos]) {
    int type = branch->data[tokenpos];
    tokenpos += 2;

    if ((type == VOCAB_TREE_NODE_COMPARE_TYPE)
	|| (type == VOCAB_TREE_NODE_COMPARE_GROUP)
	|| (type == VOCAB_TREE_NODE_FORCE_STORAGE))
      ++tokens;
    else if (type > VOCAB_TREE_NODE_LAST_WORD_STORAGE)
      tokens += 5;
    else return NULL; /* invalid */
  }

  rule = (parse_rule_t*)sci_malloc(sizeof(int) * (4 + tokens));

  ++_allocd_rules;
  rule->id = branch->id;
  rule->specials_nr = tokenpos >> 1;
  rule->length = tokens;
  rule->first_special = 0;

  tokens = 0;
  for (i = 0; i < tokenpos; i += 2) {
    int type = branch->data[i];
    int value = branch->data[i + 1];

    if (type == VOCAB_TREE_NODE_COMPARE_TYPE)
      rule->data[tokens++] = value | TOKEN_TERMINAL_CLASS;
    else if (type == VOCAB_TREE_NODE_COMPARE_GROUP)
      rule->data[tokens++] = value | TOKEN_TERMINAL_GROUP;
    else if (type == VOCAB_TREE_NODE_FORCE_STORAGE)
      rule->data[tokens++] = value | TOKEN_STUFFING_WORD;
    else { /* normal inductive rule */
      rule->data[tokens++] = TOKEN_OPAREN;
      rule->data[tokens++] = type | TOKEN_STUFFING_WORD;
      rule->data[tokens++] = value | TOKEN_STUFFING_WORD;

      if (i == 0)
	rule->first_special = tokens;

      rule->data[tokens++] = value; /* The non-terminal */
      rule->data[tokens++] = TOKEN_CPAREN;
    }
  }

  return rule;
}


static parse_rule_t *
_vsatisfy_rule(parse_rule_t *rule, result_word_t *input)
{
  int dep;

  if (!rule->specials_nr)
    return NULL;

  dep = rule->data[rule->first_special];

  if (((dep & TOKEN_TERMINAL_CLASS)
       && ((dep & 0xffff) & input->w_class))
      ||
      ((dep & TOKEN_TERMINAL_GROUP)
       && ((dep & 0xffff) & input->group))) {
    parse_rule_t *retval = (parse_rule_t*)sci_malloc(sizeof(int) * (4 + rule->length));
    ++_allocd_rules;
    retval->id = rule->id;
    retval->specials_nr = rule->specials_nr - 1;
    retval->length = rule->length;
    memcpy(retval->data, rule->data, sizeof(int) * retval->length);
    retval->data[rule->first_special] = TOKEN_STUFFING_WORD | input->group;
    retval->first_special = 0;

    if (retval->specials_nr) { /* find first special, if it exists */
      int tmp, i = rule->first_special;

      while ((i < rule->length)
	     && ((tmp = retval->data[i]) & TOKEN_NON_NT)
	     && !(tmp & TOKEN_TERMINAL))
	++i;

      if (i < rule->length)
	retval->first_special = i;
    }

    return retval;
  }
  else return NULL;
}

/************** Rule lists **************/

void
vocab_free_rule_list(parse_rule_list_t *list)
{
	if (list) {
		_vfree(list->rule);
		vocab_free_rule_list(list->next); /* Yep, this is slow and memory-intensive. */
		free(list);
	}
}

static inline int
_rules_equal_p(parse_rule_t *r1, parse_rule_t *r2)
{
	if ((r1->id != r2->id)
	    || (r1->length != r2->length)
	    || (r1->first_special != r2->first_special))
		return 0;

	return !(memcmp(r1->data, r2->data, sizeof(int) * r1->length));
}

static parse_rule_list_t *
_vocab_add_rule(parse_rule_list_t *list, parse_rule_t *rule)
{
	parse_rule_list_t *new_elem;
	int term;

	if (!rule)
		return list;

	new_elem = (parse_rule_list_t*)sci_malloc(sizeof(parse_rule_list_t));
	term = rule->data[rule->first_special];

	new_elem->rule = rule;
	new_elem->next = NULL;
	new_elem->terminal = term = ((term & TOKEN_TERMINAL)? term : 0);

	if (!list)
		return new_elem;
	else/* if (term < list->terminal) {
	       new_elem->next = list;
	       return new_elem;
	       } else*/ {
		parse_rule_list_t *seeker = list;

		while (seeker->next/* && seeker->next->terminal <= term*/) {
			if (seeker->next->terminal == term)
				if (_rules_equal_p(seeker->next->rule, rule)) {
					_vfree(rule);
					free(new_elem);
					return list; /* No duplicate rules */
				}
			seeker = seeker->next;
		}

		new_elem->next = seeker->next;
		seeker->next = new_elem;
		return list;
	}
}

static void
_vprl(parse_rule_list_t *list, int pos)
{
	if (list) {
		sciprintf("R%03d: ", pos);
		vocab_print_rule(list->rule);
		sciprintf("\n");
		_vprl(list->next, pos+1);
	} else {
		sciprintf("%d rules total.\n", pos);
	}
}

void
vocab_print_rule_list(parse_rule_list_t *list)
{
	_vprl(list, 0);
}

static parse_rule_list_t *
_vocab_split_rule_list(parse_rule_list_t *list)
{
	if (!list->next
	    || (list->next->terminal)) {
		parse_rule_list_t *tmp = list->next;
		list->next = NULL;
		return tmp;
	}
	else return _vocab_split_rule_list(list->next);
}

static void
_vocab_free_empty_rule_list(parse_rule_list_t *list)
{
	if (list->next)
		_vocab_free_empty_rule_list(list->next);

	free(list);
}

static parse_rule_list_t *
_vocab_merge_rule_lists(parse_rule_list_t *l1, parse_rule_list_t *l2)
{
	parse_rule_list_t *retval = l1, *seeker = l2;
	while (seeker) {
		retval = _vocab_add_rule(retval, seeker->rule);
		seeker = seeker->next;
	}
	_vocab_free_empty_rule_list(l2);

	return retval;
}

static int
_vocab_rule_list_length(parse_rule_list_t *list)
{
	return ((list)? _vocab_rule_list_length(list->next) + 1 : 0);
}


static parse_rule_list_t *
_vocab_clone_rule_list_by_id(parse_rule_list_t *list, int id)
{
	parse_rule_list_t *result = NULL;
	parse_rule_list_t *seeker = list;

	while (seeker) {
		if (seeker->rule->id == id) {
			result = _vocab_add_rule(result, _vdup(seeker->rule));
		}
		seeker = seeker->next;
	}

	return result;
}


parse_rule_list_t *
_vocab_build_gnf(parse_tree_branch_t *branches, int branches_nr, int verbose)
{
	int i;
	int iterations = 0;
	int last_termrules, termrules = 0;
	int ntrules_nr;
	parse_rule_list_t *ntlist = NULL;
	parse_rule_list_t *tlist, *new_tlist;

	for (i = 1; i < branches_nr; i++) { /* branch rule 0 is treated specially */
		parse_rule_t *rule = _vbuild_rule(branches + i);

		if (!rule) return NULL;
		ntlist = _vocab_add_rule(ntlist, rule);
	}

	tlist = _vocab_split_rule_list(ntlist);
	ntrules_nr = _vocab_rule_list_length(ntlist);

	if (verbose)
		sciprintf("Starting with %d rules\n", ntrules_nr);

	new_tlist = tlist;
	tlist = NULL;

	do {
		parse_rule_list_t *new_new_tlist = NULL;
		parse_rule_list_t *ntseeker, *tseeker;
		last_termrules = termrules;

		ntseeker = ntlist;
		while (ntseeker) {
			tseeker = new_tlist;

			while (tseeker) {
				parse_rule_t *newrule = _vinsert(ntseeker->rule, tseeker->rule);
				if (newrule)
					new_new_tlist = _vocab_add_rule(new_new_tlist, newrule);
				tseeker = tseeker->next;
			}

			ntseeker = ntseeker->next;
		}

		tlist = _vocab_merge_rule_lists(tlist, new_tlist);

		new_tlist = new_new_tlist;

		termrules = _vocab_rule_list_length(new_new_tlist);

		if (verbose)
			sciprintf("After iteration #%d: %d new term rules\n", ++iterations, termrules);
	} while (termrules && (iterations < 30));

	vocab_free_rule_list(ntlist);

	if (verbose) {
		sciprintf("\nGNF rules:\n");
		vocab_print_rule_list(tlist);
	}

	return tlist;
}

parse_rule_list_t *
vocab_build_gnf(parse_tree_branch_t *branches, int branches_nr)
{
  return _vocab_build_gnf(branches, branches_nr, 0);
}


void
vocab_gnf_dump(parse_tree_branch_t *branches, int branches_nr)
{
  parse_rule_list_t *tlist = _vocab_build_gnf(branches, branches_nr, 1);

  sciprintf("%d allocd rules\n", _allocd_rules);
  vocab_free_rule_list(tlist);
}


int
vocab_build_parse_tree(parse_tree_node_t *nodes, result_word_t *words, int words_nr,
		       parse_tree_branch_t *branch0, parse_rule_list_t *rules)
{
  return vocab_gnf_parse(nodes, words, words_nr, branch0, rules, 0);
}


static int
_vbpt_pareno(parse_tree_node_t *nodes, int *pos, int base)
/* Opens parentheses */
{
  nodes[base].content.branches[0] = (*pos)+1;
  nodes[++(*pos)].type = PARSE_TREE_NODE_BRANCH;
  nodes[*pos].content.branches[0] = 0;
  nodes[*pos].content.branches[1] = 0;
  return *pos;
}


static int
_vbpt_parenc(parse_tree_node_t *nodes, int *pos, int paren)
/* Closes parentheses for appending */
{
  nodes[paren].content.branches[1] = ++(*pos);
  nodes[*pos].type = PARSE_TREE_NODE_BRANCH;
  nodes[*pos].content.branches[0] = 0;
  nodes[*pos].content.branches[1] = 0;
  return *pos;
}


static int
_vbpt_append(parse_tree_node_t *nodes, int *pos, int base, int value)
/* writes one value to an existing base node and creates a successor node for writing */
{
  nodes[base].content.branches[0] = ++(*pos);
  nodes[*pos].type = PARSE_TREE_NODE_LEAF;
  nodes[*pos].content.value = value;
  nodes[base].content.branches[1] = ++(*pos);
  nodes[*pos].type = PARSE_TREE_NODE_BRANCH;
  nodes[*pos].content.branches[0] = 0;
  nodes[*pos].content.branches[1] = 0;
  return *pos;
}


static int
_vbpt_terminate(parse_tree_node_t *nodes, int *pos, int base, int value)
     /* Terminates, overwriting a nextwrite forknode */
{
  nodes[base].type = PARSE_TREE_NODE_LEAF;
  nodes[base].content.value = value;
  return *pos;
}

static int
_vbpt_write_subexpression(parse_tree_node_t *nodes, int *pos,
			  parse_rule_t *rule, int rulepos, int writepos)
{
  uint token;
  while ((token = ((rulepos < rule->length)? rule->data[rulepos++] : TOKEN_CPAREN)) != TOKEN_CPAREN) {
    uint nexttoken = (rulepos < rule->length)? rule->data[rulepos] : TOKEN_CPAREN;
    if (token == TOKEN_OPAREN) {
      int wpold;
      int writepos2 = _vbpt_pareno(nodes, pos, wpold = writepos);
      rulepos = _vbpt_write_subexpression(nodes, pos, rule, rulepos, writepos2);
      nexttoken = (rulepos < rule->length)? rule->data[rulepos] : TOKEN_CPAREN;
      if (nexttoken != TOKEN_CPAREN)
	writepos = _vbpt_parenc(nodes, pos, wpold);
    } else if (token & TOKEN_STUFFING_WORD) {
      if (nexttoken == TOKEN_CPAREN)
	writepos = _vbpt_terminate(nodes, pos, writepos, token & 0xffff);
      else
	writepos = _vbpt_append(nodes, pos, writepos, token & 0xffff);
    } else {
      sciprintf("\nError in parser (grammar.c, _vbpt_write_subexpression()): Rule data broken in rule ");
      vocab_print_rule(rule);
      sciprintf(", at token position %d\n", *pos);
      return rulepos;
    }
  }

  return rulepos;
}

int
vocab_gnf_parse(parse_tree_node_t *nodes, result_word_t *words, int words_nr,
		parse_tree_branch_t *branch0, parse_rule_list_t *tlist, int verbose)
{
  /* Get the start rules: */
  parse_rule_list_t *work = _vocab_clone_rule_list_by_id(tlist, branch0->data[1]);
  parse_rule_list_t *results = NULL;
  int word;

  for (word = 0; word < words_nr; word++) {
    parse_rule_list_t *new_work = NULL;
    parse_rule_list_t *reduced_rules = NULL;
    parse_rule_list_t *seeker, *subseeker;

    if (verbose)
      sciprintf("Adding word %d...\n", word);

    seeker = work;
    while (seeker) {

      if (seeker->rule->specials_nr <= (words_nr - word))
	reduced_rules = _vocab_add_rule(reduced_rules, _vsatisfy_rule(seeker->rule, words + word));

      seeker = seeker->next;
    }

    if (reduced_rules == NULL) {
      vocab_free_rule_list(work);
      if (verbose)
	sciprintf("No results.\n");
      return 1;
    }

    vocab_free_rule_list(work);

    if (word +1 < words_nr) {
      seeker = reduced_rules;

      while (seeker) {
	if (seeker->rule->specials_nr) {
	  int my_id = seeker->rule->data[seeker->rule->first_special];

	  subseeker = tlist;
	  while (subseeker) {
	    if (subseeker->rule->id == my_id)
	      new_work = _vocab_add_rule(new_work, _vinsert(seeker->rule, subseeker->rule));

	    subseeker = subseeker->next;
	  }
	}

	seeker = seeker->next;
      }
      vocab_free_rule_list(reduced_rules);
    } else /* last word */
      new_work = reduced_rules;

    work = new_work;
    if (verbose)
      sciprintf("Now at %d candidates\n", _vocab_rule_list_length(work));
    if (work == NULL) {
      if (verbose)
	sciprintf("No results.\n");
      return 1;
    }
  }

  results = work;

  if (verbose) {
    sciprintf("All results (excluding the surrounding '(141 %03x' and ')'):\n",
	      branch0->id);
    vocab_print_rule_list(results);
    sciprintf("\n");
  }

  /* now use the first result */
  {
    int temp, pos;

    nodes[0].type = PARSE_TREE_NODE_BRANCH;
    nodes[0].content.branches[0] = 1;
    nodes[0].content.branches[1] = 2;

    nodes[1].type = PARSE_TREE_NODE_LEAF;
    nodes[1].content.value = 0x141;

    nodes[2].type = PARSE_TREE_NODE_BRANCH;
    nodes[2].content.branches[0] = 0;
    nodes[2].content.branches[1] = 0;

    pos = 2;

    temp = _vbpt_append(nodes, &pos, 2, branch0->id);
    /*    _vbpt_write_subexpression(nodes, &pos, results[_vocab_rule_list_length(results)].rule, 0, temp); */
    _vbpt_write_subexpression(nodes, &pos, results->rule, 0, temp);
  }

  vocab_free_rule_list(results);
  return 0;
}