#From: reeds@gauss.att.com
#Subject: Voytypo and parser
#Date: Mon, 16 Dec 91 00:39:03 EST
#To: jim@rand.org
#
#Moved by a spirit of emulation, may I offer these gems?  Collate
#does not help much.  I found that voytypo on a transcription can
#find some egregious blunders.  I have found parser.c useful for
#several things now; it replaces an earlier 'mkskel.c' of mine.

# To unbundle, sh this file
echo collate 1>&2
sed 's/.//' >collate <<'//GO.SYSIN DD collate'
-#!/bin/sh
-#
-# collate file1 file2
-#
-# Reduce both transcription files to word streams with `parser -w`
-# and show how the two streams differ.
-
-PATH=/usr/reeds/voy/bin:$PATH
-T=/tmp/voy$$
-
-# Remove the scratch files on exit.  The blank after `{` and the `;`
-# before `}` are required for the shell to accept this as a function
-# body; -f keeps rm quiet when a scratch file was never created.
-cleanup() { rm -f "$T.1" "$T.2"; }
-trap cleanup 0
-
-parser -w < "$1" > "$T.1"
-parser -w < "$2" > "$T.2"
-
-diff "$T.1" "$T.2"
//GO.SYSIN DD collate
echo voytypo 1>&2
sed 's/.//' >voytypo <<'//GO.SYSIN DD voytypo'
-#!/bin/sh
-#
-# voytypo file...
-#
-# Split the named transcriptions into a sorted list of distinct words,
-# score the words with vtypo, and print every line on which any of the
-# three scores exceeds 10.
-
-PATH=/usr/reeds/voy/bin:$PATH
-CORPUS=/usr/reeds/voy/data/voynich.now
-SALT=/usr/reeds/voy/data/salt
-T=/tmp/voy$$
-
-# Remove the scratch file on exit.  The blank after `{` and the `;`
-# before `}` are required for the shell to accept this as a function
-# body; -f keeps rm quiet when the scratch file was never created.
-cleanup() { rm -f "$T"; }
-trap cleanup 0
-
-# "$@" hands each file name to cat unchanged, even if it contains blanks.
-cat "$@" | parser -w | sort -u > "$T"
-vtypo -x "$SALT" "$T" | awk '{if ($1>10 || $2>10 || $3>10) print $0}'
-
-# to prepare a fresh salt file:
-#cat $* | parser -w > $T; vtypo -s $T -x $SALT
//GO.SYSIN DD voytypo
echo src/vtypo.c 1>&2
sed 's/.//' >src/vtypo.c <<'//GO.SYSIN DD src/vtypo.c'
-#include <stdio.h>
-#include <math.h>
-extern void exit();
-
-/*
- * A modern recoding of Cherry and Morris 'typo' program
- * Jim Reeds Dec 1991
- */
-
-/* Alphabet size: the length of the csets[] string (blank, 10 digits, 26 capitals). */
-#define A 37
-/* Character code -> position in its csets[] string; characters not listed stay 0. */
-char map[256];
-
-/* Character classes, terminated by a null pointer; main() fills map[] from them. */
-char *csets[] = {
-	" 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ",
-	(char*)0
-};
-
-/* Pair counts f2 and triple counts f3 are tallied by getstats(). */
-long	f2[A][A];
-long	f3[A][A][A];
-/* Per-triple weights: computed by mkwts() or read from a file by salt(). */
-double	wt[A][A][A];
-
-/*
- * Print the command synopsis on stderr and exit with status 1.
- */
-static void
-usage()
-{
-	fprintf(stderr, "usage: typo [-x saltfile] [-s trainingdoc]... doc...\n");
-	exit(1);
-}
-
-/*
- * typo  [-x saltfile] [-s trainingdoc]... doc...
- *
- * Arguments are handled strictly left to right:
- *   -s file   add file's pair/triple counts to the statistics
- *   -x file   load weights from file, or create it and exit (see salt())
- *   doc       add doc's own counts, recompute the weights, then score
- *             every word of doc (see finderrs())
- *
- * NOTE(review): every doc argument calls mkwts(), which overwrites any
- * weights that -x loaded; confirm whether that is intended.
- */
-main(ac, av)
-char *av[];
-{
-	char *s;
-	int i;
-
-	/* index through unsigned char so a high-bit character cannot go negative */
-	for(i=0; csets[i]; i++)
-		for(s=csets[i];*s;s++)
-			map[(unsigned char)*s] = s-csets[i];
-
-	for(i=1; i<ac; i++){
-		if(strcmp(av[i], "-s")==0) {
-			/* the option needs a file name; do not walk off the end of av */
-			if(++i >= ac)
-				usage();
-			getstats(av[i]);
-		}
-		else if(strcmp(av[i], "-x")==0) {
-			if(++i >= ac)
-				usage();
-			salt(av[i]);
-		}
-		else if(av[i][0] == '-') {
-			/* not a system error, so perror() would only append noise */
-			usage();
-		}
-		else {
-			getstats(av[i]);
-			mkwts();
-			finderrs(av[i]);
-		}
-	}
-	return 0;
-}
-
-/*
- * hack: if it doesn't exist, create it & die
- * if it does, read it & run on
- *
- * s names a file holding a raw image of wt[].  When s can be opened and
- * yields a full-sized image, the weights are loaded and salt() returns.
- * Otherwise the file is (re)created from the statistics gathered so far
- * and the program exits with status 0.  Failure to create or write the
- * file is reported through err().
- */
-salt(s)
-char *s;
-{
-	int fd, n;
-
-	fd = open(s, 0);
-	if(fd != -1) {
-		n = read(fd, (char*)wt, sizeof(wt));
-		close(fd);		/* don't leak the descriptor on either path */
-		if(n == sizeof(wt))
-			return;
-		/* short or failed read: fall through and rebuild the file */
-	}
-	fd = creat(s, 0644);
-	if(fd == -1)
-		err(s);
-	mkwts();
-	/* only announce "ready" once the whole image is safely written */
-	if(write(fd, (char*)wt, sizeof(wt)) != sizeof(wt) || close(fd) == -1)
-		err(s);
-	printf("%s ready\n", s);
-	exit(0);
-}
-
-/*
- * Fatal error: hand f to perror() as the message prefix, then exit
- * with status 1.  Does not return.
- */
-err(f)
-char *f;
-{
-	perror(f);
-	exit(1);
-}
-
-
-/*
- * Add the character-pair and character-triple counts of file f to
- * f2[] and f3[].  Each byte is first translated through map[]; code 0
- * acts as the word break.  An unreadable file is fatal (see err()).
- */
-getstats(f)
-char *f;
-{
-	FILE *in;
-	int ch, cur;
-	int older = 0, prev = 0;
-
-	if((in = fopen(f, "r")) == NULL)
-		err(f);
-
-	/* the step expression slides the two-character history window */
-	for(; (ch = getc(in)) != EOF; older = prev, prev = cur){
-		cur = map[ch];
-		if(prev == 0 && cur == 0)
-			continue;		/* break followed by break: nothing to count */
-		f2[prev][cur]++;
-		if(prev != 0)			/* don't span word breaks */
-			f3[older][prev][cur]++;
-	}
-	fclose(in);
-}
-
-/*
- * Log of a count less one; counts of 1 or below get the floor value -10.
- */
-static double
-logcount(n)
-long n;
-{
-	return n > 1 ? log(n - 1.) : -10.;
-}
-
-/*
- * Fill wt[i][j][k] from the current counts: the mean of the two pair
- * terms (i,j) and (j,k) minus the triple term (i,j,k).
- */
-mkwts()
-{
-	int i, j, k;
-
-	for(i=0; i<A; i++)
-		for(j=0; j<A; j++){
-			double left = logcount(f2[i][j]);
-
-			for(k=0; k<A; k++)
-				wt[i][j][k] = (left + logcount(f2[j][k])) / 2
-					- logcount(f3[i][j][k]);
-		}
-}
-
-/*
- * Score every word of file f against wt[] and pipe one line per word
- * to "sort -rnu":
- *
- *	mean weight <tab> root-mean-square weight <tab> largest weight <tab> word
- *
- * A word is a run of characters with nonzero map[] codes; any other
- * character ends it.  The triple that ends on the terminating character
- * is scored too.  Words that do not fit in wbuf are scored in full but
- * printed truncated.  A final word with no terminator before EOF is not
- * reported.
- */
-finderrs(f)
-char *f;
-{
-	char *wp, wbuf[100];
-	char *wend = wbuf + sizeof(wbuf) - 1;	/* keep room for the NUL */
-	double w, wbig[3];
-	int n, a, b, c, raw;
-	FILE *fp, *tp;
-
-	fp = fopen(f, "r");
-	if(fp == NULL)err(f);
-
-	tp = popen("sort -rnu", "w");
-	if(tp == NULL)err("pipe to sort");
-
-	wp = wbuf;
-	n = a = b = 0;
-	wbig[0] = wbig[1] = wbig[2] = 0;
-	while((raw=getc(fp))!=EOF) {
-		c = map[raw];
-		/* save only word characters, and never write past the buffer */
-		if(c != 0 && wp < wend)
-			*wp++ = raw;
-		if(b != 0) {
-			w = wt[a][b][c];
-			n++;
-			wbig[0] += w;
-			wbig[1] += w*w;
-			if(w>wbig[2])wbig[2] = w;
-		}
-		if(c == 0) {
-			if(n>0){
-				wbig[0] /= n;
-				wbig[1] = sqrt(wbig[1]/n);
-				*wp = 0;
-				fprintf(tp, "%g\t%g\t%g\t%s\n",
-					wbig[0], wbig[1], wbig[2], wbuf);
-			}
-			wp = wbuf;
-			n = a = b = 0;
-			wbig[0] = wbig[1] = wbig[2] = 0;
-		}
-		a = b;
-		b = c;
-	}
-
-	pclose(tp);
-	fclose(fp);
-}
//GO.SYSIN DD src/vtypo.c
echo src/parser.c 1>&2
sed 's/.//' >src/parser.c <<'//GO.SYSIN DD src/parser.c'
-#include <ctype.h>
-#include <stdio.h>
-
-extern char *strncpy();
-extern void exit();
-
-/*
- * Non alphanums that may occur in a Currier transcription line:
- */
-char cset[] = " \t[]()|.,*-#";
-/* currier[c] is nonzero when byte c may appear in transcription data; filled in by main() */
-char currier[256];
-#define iscurrier(x)(currier[x])
-
-/* current input line number: counted by main(), quoted by syntax() */
-int lineno;
-
-/*
- * Text of the item most recently collected.  bput() appends at bp and
- * stops with an error once bp reaches bend.
- */
-char buffer[4096], *bp, *bend = &buffer[4096];
-
-/*
- * parser for the new transcription format
- *
- * One set of callbacks per output mode.  main() calls blank with each
- * white-space character, and the other three with no arguments once the
- * text of a <...> locus, a {...} comment or a run of Currier data has
- * been collected in buffer.
- */
-struct handler {
-	int	(*blank)();
-	int	(*locus)();
-	int	(*comment)();
-	int	(*currier)();
-};
-struct handler justparse;	/* default */
-struct handler skeleton;	/* -s	strip away currier */
-struct handler postscript;	/* -p */
-struct handler troff;		/* -t */
-struct handler wordspit;	/* -w	reduce to a stream of words */
-
-/*
- * parser [-s | -p | -t | -w]
- *
- * Read a transcription on standard input, split it into white space,
- * {comments}, <loci> and Currier data, and pass each item to the
- * handler set chosen by the first argument (default: justparse).
- * Problems are reported by syntax() and parsing carries on.
- * Returns 0.
- */
-main(ac, av)
-char **av;
-{
-	struct handler *fp = &justparse;
-	char *s;
-	int c, state;
-
-	/*
-	 * Currier set: original 36 plus:
-	 *    lower case for expansion
-	 *    [|] for alternation
-	 *    . / .. * # , space
-	 */
-	/* store 0/1: isalnum() may return a value that does not fit in a char */
-	for(c=0;c<256;c++)
-		currier[c] = isalnum(c) != 0;
-
-	for(s=cset;  c=*s; s++)
-		currier[c] = 1;
-
-	if(ac>1 && strcmp(av[1],"-s")==0) fp = &skeleton;
-	if(ac>1 && strcmp(av[1],"-p")==0) fp = &postscript;
-	if(ac>1 && strcmp(av[1],"-t")==0) fp = &troff;
-	if(ac>1 && strcmp(av[1],"-w")==0) fp = &wordspit;
-
-	state = 0;		/* becomes 1 once a locus is seen on this line */
-	lineno = 1;
-
-	while((c=getchar())!=EOF){
-		if(isspace(c)){
-			if(c=='\n'){
-				state = 0;
-				lineno++;
-			}
-			(fp->blank)(c);
-		}
-		else if(c=='{'){
-			dobalanced('}');
-			(fp->comment)();
-		}
-		else if(c=='<'){
-			state = 1;
-			dobalanced('>');
-			(fp->locus)();
-		}
-		else if(iscurrier(c)){
-			if(!state)
-				syntax("Currier data without locus\n", 0);
-			/*
-			 * docurrier() starts collecting with the next
-			 * getchar(), so give this character back first;
-			 * otherwise the first character of every run of
-			 * data is lost (and a leading '[' never reaches
-			 * doalternate()).
-			 */
-			ungetc(c, stdin);
-			docurrier();
-			(fp->currier)();
-		}
-		else syntax("bad char %c\n", c);
-	}
-	return 0;
-}
-
-
-/*
- * handlers if just parsing: write each item back to standard output,
- * loci as "<text> " and comments as "{text}"
- */
-jblank(c)
-{
-	putchar(c);
-}
-
-jlocus()
-{
-	putchar('<');
-	fputs(buffer, stdout);
-	fputs("> ", stdout);
-}
-
-jcomment()
-{
-	putchar('{');
-	fputs(buffer, stdout);
-	putchar('}');
-}
-
-jcurrier()
-{
-	fputs(buffer, stdout);
-}
-
-struct handler justparse = {jblank, jlocus, jcomment, jcurrier};
-
-/*
- * wordspit: ignore everything except Currier data, which is written
- * one word per line
- */
-wnil()
-{
-}
-
-/*
- * Blank, tab, '-', '#' and '.' separate words; every other character
- * of the data is copied through.
- */
-wcurrier()
-{
-	char *p;
-
-	for(p = buffer; *p != 0; p++){
-		if(*p==' ' || *p=='\t' || *p=='-' || *p=='#' || *p=='.')
-			Freshline();
-		else
-			Put(*p);
-	}
-	Freshline();
-}
-
-struct handler wordspit = {wnil, wnil, wnil, wcurrier};
-
-
-/*
- * skeleton stripping:
- * toss all currier data and all comments up to next locus
- */
-int mayprint = 1;	/* cleared by Currier data, set again by the next locus */
-
-sblank(c)
-{
-	Put(c);
-}
-
-/* start a fresh line with the locus and let comments through again */
-slocus()
-{
-	Freshline();
-	Put('<');
-	Puts(buffer);
-	Put('>');
-	mayprint = 1;
-}
-
-/* comments are kept only while no data has been seen since the locus */
-scomment()
-{
-	if(!mayprint)
-		return;
-	Put('{');
-	Puts(buffer);
-	Put('}');
-}
-
-scurrier()
-{
-	mayprint = 0;
-}
-
-struct handler skeleton = {sblank, slocus, scomment, scurrier};
-
-
-/*
- * handlers for postscript: each item becomes a line "(text) kind";
- * white space is dropped
- */
-static
-pemit(kind)
-char *kind;
-{
-	printf("(%s) %s\n", buffer, kind);
-}
-
-pblank()
-{
-}
-
-plocus()
-{
-	pemit("locus");
-}
-
-pcomment()
-{
-	pemit("comment");
-}
-
-pcurrier()
-{
-	pemit("currier");
-}
-
-struct handler postscript = {pblank, plocus, pcomment, pcurrier};
-
-/*
- * handlers for troff
- */
-int paraskip = 0;	/* set by tcurrier() on '#', used up by the next locus */
-
-tblank()
-{
-}
-
-/* locus: ".vL text" on its own line, after an empty line if paraskip is set */
-tlocus()
-{
-	Freshline();
-	if(paraskip){
-		Put('\n');
-		paraskip = 0;
-	}
-	Puts(".vL ");
-	Puts(buffer);
-	Put('\n');
-}
-
-/* comment: a ".vC" line followed by the comment text */
-tcomment()
-{
-	Freshline();
-	Puts(".vC");
-	Put('\n');
-	Puts(buffer);
-}
-/*
- * Currier data for troff: a ".vX" line, then the buffered text with
- * its markup characters translated one by one.
- */
-tcurrier(){
-	int c;
-	char *s;
-	
-	Freshline();
-	Puts(".vX\n");
-	for(s=buffer; c = *s; s++){
-		/* a '-' that ends the buffer is dropped */
-		if(c=='-'&&s[1]==0){
-			c = 0;
-		}else
-		/* any other '-' becomes the vG string */
-		if(c=='-'&&s[1]!=0){
-			Puts("\\*(vG");
-			continue;
-		}else
-		/* ".." also becomes the vG string; skip its second dot */
-		if(c=='.'&&s[1]=='.'){
-			Puts("\\*(vG");
-			s++;
-			continue;
-		}else
-		/* lower case and the alternation characters are set in font CW */
-		if(islower(c) || c == '[' || c== '|' || c== ']'){
-			Puts("\\f(CW");
-			Put(c);
-			Puts("\\fP");
-			continue;
-		}
-		/* '#' prints nothing but makes the next tlocus() skip a line */
-		else if(c=='#') {
-			c = 0;
-			paraskip = 1;
-		}
-		/* a single '.' or a '/' prints as a blank */
-		else if(c=='.'|| c=='/') c = ' ';
-		if(c)Put(c);
-	}
-}
-struct handler troff = {tblank, tlocus, tcomment, tcurrier};;
-
-
-/*
- * parsers
- */
-/*
- * Collect input into buffer up to, but not including, the closing
- * character endbrak.  Reaching EOF first is reported as a syntax error;
- * the buffer is NUL-terminated either way.
- */
-dobalanced(endbrak)
-{
-	int ch;
-
-	bp = buffer;
-	for(;;){
-		ch = getchar();
-		if(ch == EOF)
-			break;
-		if(ch == endbrak){
-			bput(0);
-			return;
-		}
-		bput(ch);
-	}
-	bput(0);
-	syntax("expecting %c\n", endbrak);
-}
-/*
- * Collect a run of Currier data into buffer.  The run ends at a
- * newline, '{' or '<', which is pushed back for the caller, or at EOF.
- * A '[' hands control to doalternate(); characters that are neither
- * white space nor Currier are reported and skipped.
- */
-docurrier()
-{
-	int ch;
-
-	bp = buffer;
-	while((ch = getchar()) != EOF){
-		if(ch == '\n' || ch == '{' || ch == '<'){
-			ungetc(ch, stdin);
-			bput(0);
-			return;
-		}
-		if(ch == '['){
-			doalternate();
-			continue;
-		}
-		if(ch == '.' || ch == ',' || isspace(ch) || iscurrier(ch))
-			bput(ch);
-		else
-			syntax("%c not legal Currier\n", ch);
-	}
-	bput(0);
-}
-/*
- * Collect a bracketed alternation: append '[' and everything up to and
- * including the closing ']' to buffer.  Characters that are neither
- * white space nor Currier are reported and skipped; EOF before the ']'
- * is a syntax error.
- */
-doalternate()
-{
-	int ch;
-
-	bput('[');
-	for(;;){
-		ch = getchar();
-		if(ch == EOF)
-			break;
-		if(ch == ']'){
-			bput(ch);
-			return;
-		}
-		if(ch == '.' || ch == ',' || ch == '|' || isspace(ch) || iscurrier(ch))
-			bput(ch);
-		else
-			syntax("%c not legal in alternate\n", ch);
-	}
-	syntax("expecting ] to end\n",0);
-}
-/*
- * utilities
- */
-
-/*
- * Append c to buffer at bp; a full buffer is fatal (see error()).
- */
-bput(c)
-{
-	if(bp >= bend)
-		error("buffer overrun", c);
-	else
-		*bp++ = c;
-}
-/*
- * Fatal error: print message s, a blank and the character t on stderr,
- * then exit with status 1.
- */
-error(s, t)
-char *s;
-{
-	fputs(s, stderr);
-	putc(' ', stderr);
-	putc(t, stderr);
-	putc('\n', stderr);
-	exit(1);
-}
-
-/*
- * Report a syntax problem on stderr, prefixed with the current input
- * line number.  s is used as the fprintf format and t as its one
- * argument, so s must contain at most one conversion.  Does not exit.
- */
-syntax(s,t)
-char *s;
-{
-	fprintf(stderr, "syntax error line %d: ", lineno);
-	fprintf(stderr, s, t);
-}
-
-/*
- * use these when you want to be persnickety about newlines
- */
-char lastout;	/* the character most recently written through Put() */
-
-/* Write c to standard output and remember it for Freshline(). */
-Put(c)
-{
-	putchar(c);
-	lastout = c;
-}
-/* Emit a newline unless the last character Put() wrote was one. */
-Freshline()
-{
-	if(lastout == '\n')
-		return;
-	Put('\n');
-}
-/* Write the NUL-terminated string s one character at a time through Put(). */
-Puts(s)
-char *s;
-{
-	for(; *s != 0; s++)
-		Put(*s);
-}
//GO.SYSIN DD src/parser.c

