BPFK Section: PEG Morphology Algorithm

From Lojban
Jump to navigation Jump to search
; This is a Parsing Expression Grammar for the morphology of Lojban.%%%
; See [http://www.pdos.lcs.mit.edu/~baford/packrat/] %%%
; %%%
; All rules have the form %%%
; %%%
; 	name <- peg-expression%%%
; %%%
; which means that the grammatical construct "name" is parsed using%%%
; "peg-expression".  %%%
; %%%
; 1)  Concatenation is expressed by juxtaposition with no operator symbol.%%%
; 2)  / represents *ORDERED* alternation (choice).  If the first%%%
;     option succeeds, the others will never be checked.%%%
; 3)  ? indicates that the element to the left is optional.%%%
; 4)  * represents zero-or-more repetition of the construct to the left.%%%
; 5)  + represents one-or-more repetition of the construct to the left.%%%
; 6)  () serves to indicate the grouping of the other operators.%%%
; 7)  & indicates that the element to the right must follow (a lookahead:%%%
;     the check itself does not consume any input).%%%
; 8)  ! indicates that the element to the right must not follow (again a%%%
;     lookahead: the check itself does not consume any input).%%%
; 9)  . represents any character.%%%
; 10) ' ' or " " represents a literal string.%%%
; 11) [] represents a character class.   %%% 
;%%%
; Repetitions grab as much as they can.%%%
;%%%
;%%%
; --- GRAMMAR ---%%%
; This grammar classifies words by their morphological class (cmene,%%%
; gismu, lujvo, fuhivla, cmavo, and non-lojban-word). %%%
; %%%
; The final section sorts cmavo into grammatical classes (A, BAI, BAhE, ..., ZOhU).%%%
;%%%
; mi'e ((xorxes))%%%

;-------------------------------------------------------------------%%%

; Top-level entry: an optional pause, then any number of words, each of which
; may be followed by a pause.
words <- pause? (word pause?)*

; lojban-word is tried first; non-lojban-word is only the fallback, because
; "/" is ordered choice.
word <- lojban-word / non-lojban-word

; Word classes are tried in this order: cmene, then cmavo, then brivla.
lojban-word <- cmene / cmavo / brivla

; brivla subclasses are tried in this order: gismu, then fuhivla, then lujvo.
brivla <- gismu / fuhivla / lujvo

;-------------------------------------------------------------------

; A cmene is a jbocme or, failing that, a zifcme.
cmene <- jbocme / zifcme

; zifcme: must not begin with h.  Consumes any run of nuclei, glides, h,
; digits, and consonants that are not directly followed by a pause, and then
; requires one final consonant with a pause after it (the pause is only
; checked, not consumed).
zifcme <- !h (nucleus / glide / h / consonant !pause / digit)* consonant &pause

; jbocme: the input must pass the zifcme check (lookahead only) and must also
; be consumable as a sequence of any-syllable / digit units reaching a pause.
jbocme <- &zifcme (any-syllable / digit)* &pause 

;cmene <- !h &consonant-final coda? (any-syllable / digit)* &pause

;consonant-final <- (non-space &non-space)* consonant &pause

;cmene <- !h cmene-syllable* &consonant coda? consonantal-syllable* onset &pause

;cmene-syllable <- !doi-la-lai-lahi coda? consonantal-syllable* onset nucleus / digit

;doi-la-lai-lahi <- (d o i / l a (h? i)?) !h !nucleus

;-------------------------------------------------------------------

; cmavo: input that is neither a cmene nor a CVCy-lujvo (both checked by
; negative lookahead), has cmavo-form, and is followed by post-word (checked
; only, not consumed).
cmavo <- !cmene !CVCy-lujvo cmavo-form &post-word 

; CVCy-lujvo: a CVC rafsi followed by y that continues as a brivla, either
; unstressed CVC + y (+ optional h) + further rafsi + brivla-core, or
; stressed CVC + y + short-final-rafsi.  Used by cmavo above as an exclusion.
CVCy-lujvo <- CVC-rafsi y h? initial-rafsi* brivla-core / stressed-CVC-rafsi y short-final-rafsi

; cmavo-form has three ordered alternatives:
;   1) not starting with h or a cluster: an onset, zero or more "nucleus h"
;      pairs, then a last nucleus that either does not match `stressed` or is
;      not followed by a cluster;
;   2) one or more y;
;   3) a single digit.
cmavo-form <- !h !cluster onset (nucleus h)* (!stressed nucleus / nucleus !cluster) / y+ / digit

;-------------------------------------------------------------------

; lujvo: zero or more initial rafsi followed by a brivla-core, provided the
; input does not already parse as a gismu, a fuhivla, or a cmavo.
lujvo <- !gismu !fuhivla !cmavo initial-rafsi* brivla-core

; brivla-core: the part that ends a lujvo -- a whole fuhivla, a whole gismu,
; a CVV-final-rafsi, or a stressed initial rafsi plus a short final rafsi.
brivla-core <- fuhivla / gismu / CVV-final-rafsi / stressed-initial-rafsi short-final-rafsi 

; The stressed counterparts of the three initial-rafsi alternatives below.
stressed-initial-rafsi <- stressed-extended-rafsi / stressed-y-rafsi / stressed-y-less-rafsi

; initial-rafsi: extended rafsi are tried first, then y-hyphenated rafsi.
; A y-less rafsi is accepted only if no extended rafsi (or fuhivla) starts
; either at its beginning or directly after it.
initial-rafsi <- extended-rafsi / y-rafsi / !any-extended-rafsi y-less-rafsi !any-extended-rafsi

; Helper for the lookaheads in initial-rafsi: anything fuhivla-like.
any-extended-rafsi <- fuhivla / extended-rafsi / stressed-extended-rafsi

;-------------------------------------------------------------------

; fuhivla: a fuhivla-head, then the stressed syllable, optional consonantal
; syllables, and the final syllable.
fuhivla <- fuhivla-head stressed-syllable consonantal-syllable* final-syllable

; Extended rafsi come in a stressed and an unstressed flavour, each built
; either on a brivla-head or on a fuhivla-head.
stressed-extended-rafsi <- stressed-brivla-rafsi / stressed-fuhivla-rafsi 

extended-rafsi <- brivla-rafsi / fuhivla-rafsi

; Must start with an unstressed syllable (lookahead); then brivla-head, a
; stressed syllable, and the "h y" hyphen.
stressed-brivla-rafsi <- &unstressed-syllable brivla-head stressed-syllable h y

; Lookahead requires at least two syllables (with optional consonantal
; syllables between them); then brivla-head, "h y", and an optional h.
brivla-rafsi <- &(syllable consonantal-syllable* syllable) brivla-head h y h?

; fuhivla-head, a stressed syllable, then an onset that is not h, then y.
stressed-fuhivla-rafsi <- fuhivla-head stressed-syllable !h onset y 

; Unstressed version: starts with an unstressed syllable (lookahead), then
; fuhivla-head, a non-h onset, y, and an optional h.
fuhivla-rafsi <- &unstressed-syllable fuhivla-head !h onset y h?

; A brivla-head that does not parse as a rafsi-string.
fuhivla-head <- !rafsi-string brivla-head

; brivla-head: zero or more unstressed syllables, at a position that is not a
; cmavo, not a slinkuhi, does not start with h, and does start with an onset.
brivla-head <- !cmavo !slinkuhi !h &onset unstressed-syllable*

; slinkuhi: not itself a rafsi-string, but becomes one once a single leading
; consonant is removed.
slinkuhi <- !rafsi-string consonant rafsi-string 

; rafsi-string: zero or more y-less rafsi followed by one of the listed
; endings (tried in the order written).
rafsi-string <- y-less-rafsi* (gismu / CVV-final-rafsi / stressed-y-less-rafsi short-final-rafsi / y-rafsi / stressed-y-rafsi / stressed-y-less-rafsi? initial-pair y / hy-rafsi / stressed-hy-rafsi)

;-------------------------------------------------------------------

; gismu: shape CCVCV (with an initial-pair) or CVCCV, first vowel stressed.
; The closing "consonant vowel" must also pass the final-syllable lookahead,
; and post-word must follow.
gismu <- (initial-pair stressed-vowel / consonant stressed-vowel consonant) &final-syllable consonant vowel &post-word

; CVV-final-rafsi: consonant, stressed vowel, h, then a vowel that must also
; be (together with the h) a final-syllable; post-word must follow.
CVV-final-rafsi <- consonant stressed-vowel h &final-syllable vowel &post-word

; short-final-rafsi: a final syllable of shape consonant + diphthong or
; initial-pair + vowel, followed by post-word.
short-final-rafsi <- &final-syllable (consonant diphthong / initial-pair vowel) &post-word

; --- stressed rafsi shapes (counterparts of the unstressed rules below) ---

; A long rafsi plus stressed vowel, or a stressed CCV / CVV rafsi, then "h y".
stressed-hy-rafsi <- (long-rafsi stressed-vowel / stressed-CCV-rafsi / stressed-CVV-rafsi) h y

; A stressed long or CVC rafsi followed by the y hyphen.
stressed-y-rafsi <- (stressed-long-rafsi / stressed-CVC-rafsi) y

; A stressed short rafsi; the CVC form must not be followed by y.
stressed-y-less-rafsi <- stressed-CVC-rafsi !y / stressed-CCV-rafsi / stressed-CVV-rafsi

; CCVC (with an initial-pair) or CVCC, vowel stressed.
stressed-long-rafsi <- initial-pair stressed-vowel consonant / consonant stressed-vowel consonant consonant

stressed-CVC-rafsi <- consonant stressed-vowel consonant 

stressed-CCV-rafsi <- initial-pair stressed-vowel 

; CV'V with the second vowel stressed, or C + stressed diphthong; an
; r-hyphen may follow.
stressed-CVV-rafsi <- consonant (unstressed-vowel h stressed-vowel / stressed-diphthong) r-hyphen? 


; --- unstressed rafsi shapes ---

; A long rafsi plus vowel, or a CCV / CVV rafsi, then "h y" and optional h.
hy-rafsi <- (long-rafsi vowel / CCV-rafsi / CVV-rafsi) h y h?

; A long or CVC rafsi followed by the y hyphen and an optional h.
y-rafsi <- (long-rafsi / CVC-rafsi) y h?

; A short rafsi (CVC, CCV or CVV) at a position where none of the y / hy
; rafsi rules (stressed or not) match, and which is not followed by h.
y-less-rafsi <- !y-rafsi !stressed-y-rafsi !hy-rafsi !stressed-hy-rafsi (CVC-rafsi / CCV-rafsi / CVV-rafsi) !h 

; CCVC (with an initial-pair) or CVCC, vowel unstressed.
long-rafsi <- initial-pair unstressed-vowel consonant / consonant unstressed-vowel consonant consonant 

CVC-rafsi <- consonant unstressed-vowel consonant

CCV-rafsi <- initial-pair unstressed-vowel

; CV'V with both vowels unstressed, or C + unstressed diphthong; an
; r-hyphen may follow.
CVV-rafsi <- consonant (unstressed-vowel h unstressed-vowel / unstressed-diphthong) r-hyphen?

; r-hyphen: r when a consonant follows, otherwise n when r follows.
r-hyphen <- r &consonant / n &r

;-------------------------------------------------------------------

; final-syllable: onset plus a nucleus that is not y and does not match
; `stressed`; what follows must not parse as a cmene and must be post-word.
final-syllable <-  onset !y !stressed nucleus !cmene &post-word

; A syllable / diphthong / vowel counts as stressed in one of two ways:
; it matches `stressed` (an uppercase vowel), or it is followed by `stress`.
stressed-syllable <- &stressed syllable / syllable &stress

stressed-diphthong <- &stressed diphthong / diphthong &stress

stressed-vowel <- &stressed vowel / vowel &stress

; The unstressed counterparts require both stress tests to fail.  A
; consonantal-syllable is accepted as unstressed without those tests.
unstressed-syllable <- !stressed syllable !stress / consonantal-syllable

unstressed-diphthong <- !stressed diphthong !stress

unstressed-vowel <- !stressed vowel !stress

; stress: what must follow a positionally stressed vowel -- optional
; consonants, optional h, optional y, then exactly one syllable and a pause.
stress <- consonant* h? y? syllable pause

; stressed: an onset followed (after optional commas) by an uppercase vowel.
stressed <- onset comma* [AEIOU]

; any-syllable allows y as nucleus; `syllable` below excludes it.
any-syllable <- onset nucleus coda? / consonantal-syllable 

syllable <- onset !y nucleus coda?

; A consonant whose next letter is a syllabic consonant (lookahead), followed
; by a coda.
consonantal-syllable <- consonant &syllabic coda

; coda: either one consonant that does not itself begin a syllable but is
; followed by one, or an optional syllabic plus an optional consonant
; directly before a pause.
coda <- !any-syllable consonant &any-syllable / syllabic? consonant? &pause 

; onset alternatives are tried in this order: h, glide, initial.
onset <-  h / glide / initial

; nucleus: a vowel, a diphthong, or a y that is not followed by a nucleus.
nucleus <- vowel / diphthong / y !nucleus

;-----------------------------------------------------------------

; glide: i or u when a nucleus follows.
glide <- (i / u) &nucleus

; diphthong: ai, au, ei or oi; the second letter must not be repeated
; immediately afterwards, and no nucleus may follow the diphthong.
diphthong <- (a i !i / a u !u / e i !i / o i !i) !nucleus

; vowel: a single a, e, i, o or u that is not followed by a nucleus.
vowel <- (a / e / i / o / u) !nucleus

; Letter rules: each skips any leading commas and accepts the letter in
; either case.
a <- comma* [aA] 

e <- comma* [eE] 

i <- comma* [iI] 

o <- comma* [oO] 

u <- comma* [uU] 

y <- comma* [yY] 

;-------------------------------------------------------------------

; cluster: two or more consonants in a row.
cluster <- consonant consonant+

; initial-pair: exactly two consonants (no third one after them), at a
; position where `initial` matches.
initial-pair <- &initial consonant consonant !consonant

; initial: an affricate, or any combination of optional sibilant, optional
; "other" consonant and optional liquid (in that order); it must not be
; followed by a consonant or a glide.  Every part of the second alternative
; is optional, so it can match without consuming anything.
initial <- (affricate / sibilant? other? liquid?) !consonant !glide

affricate <- t c / t s / d j / d z

liquid <- l / r 

; t and d are rejected before l; n is rejected before a liquid.
other <- p / t !l / k / f / x / b / d !l / g / v / m / n !liquid 

; s is rejected before x; j and z are rejected before n or a liquid.
sibilant <- c / s !x / (j / z) !n !liquid

; Consonant classes.
consonant <- voiced / unvoiced / syllabic

syllabic <- l / m / n / r

voiced <- b / d / g / j / v / z

unvoiced <- c / f / k / p / s / t / x

; Consonant letter rules.  Each skips leading commas, accepts the letter in
; either case, and then applies negative lookaheads: never before h or a
; glide, never doubled; voiced letters never before an unvoiced one and vice
; versa; plus the specific exclusions written on individual lines
; (m-z, n-affricate, j-z, z-j, s-c, c-s, c-x, x-c, x-k, k-x).
l <- comma* [lL] !h !glide !l

m <- comma* [mM] !h !glide !m !z

n <- comma* [nN] !h !glide !n !affricate

r <- comma* [rR] !h !glide !r

b <- comma* [bB] !h !glide !b !unvoiced

d <- comma* [dD] !h !glide !d !unvoiced

g <- comma* [gG] !h !glide !g !unvoiced

v <- comma* [vV] !h !glide !v !unvoiced

j <- comma* [jJ] !h !glide !j !z !unvoiced

z <- comma* [zZ] !h !glide !z !j !unvoiced

s <- comma* [sS] !h !glide !s !c !voiced

c <- comma* [cC] !h !glide !c !s !x !voiced

x <- comma* [xX] !h !glide !x !c !k !voiced

k <- comma* [kK] !h !glide !k !x !voiced

f <- comma* [fF] !h !glide !f !voiced

p <- comma* [pP] !h !glide !p !voiced

t <- comma* [tT] !h !glide !t !voiced

; h: an apostrophe or a lowercase letter h (after optional commas); a
; nucleus must follow.
h <- comma* ['h] &nucleus

;-------------------------------------------------------------------

; digit: one decimal digit (after optional commas), not followed by h or a
; nucleus.
digit <- comma* [0123456789] !h !nucleus

; post-word: what may follow a word -- a pause, or directly another
; lojban-word that does not begin with a nucleus.
post-word <- pause / !nucleus lojban-word

; pause: optional commas then one or more space characters, or end of input.
pause <- comma* space-char+ / EOF

; EOF: optional commas followed by no further character.
EOF <- comma* !.

comma <- [,]

; non-lojban-word: one or more non-space characters at a position where no
; lojban-word matches.
non-lojban-word <- !lojban-word non-space+

; NOTE(review): the NORATS prefix on the next two rules is not explained in
; this file; presumably a directive for a grammar-conversion tool -- confirm.
; non-space: any one character that is not a space-char.
NORATS non-space <- !space-char .

; space-char: period, tab, newline, carriage return, "?", "!" or the space
; character (\u0020).
NORATS space-char <- [.\t\n\r?!\u0020]

;-------------------------------------------------------------------

; spaces: initial-spaces at a position where the cmavo class Y does not match.
; NOTE(review): no other rule in the visible part of this file refers to
; `spaces`; presumably it is used by a grammar layered on top -- confirm.
spaces <- !Y initial-spaces

; initial-spaces: one or more units, each either a space-char (after optional
; commas) or a Y that does not begin a ybu, optionally followed by EOF;
; or EOF alone.
initial-spaces <- (comma* space-char / !ybu Y)+ EOF? / EOF

; ybu: the cmavo class Y, optional space characters, then the class BU.
ybu <- Y space-char* BU

;-------------------------------------------------------------------

; --- cmavo classes ---
; Each class rule has the shape
;     CLASS <- &cmavo ( spelling / spelling / ... ) &post-word
; &cmavo only checks, without consuming, that a cmavo starts here; the
; parenthesised alternatives consume the letters; &post-word checks what
; follows.  Spellings are sequences of the single-letter rules, where h is
; the rule that accepts an apostrophe or the letter h.
A <- &cmavo ( a / e / j i / o / u ) &post-word

BAI <- &cmavo ( d u h o / s i h u / z a u / k i h i / d u h i / c u h u / t u h i / t i h u / d i h o / j i h u / r i h a / n i h i / m u h i / k i h u / v a h u / k o i / c a h i / t a h i / p u h e / j a h i / k a i / b a i / f i h e / d e h i / c i h o / m a u / m u h u / r i h i / r a h i / k a h a / p a h u / p a h a / l e h a / k u h u / t a i / b a u / m a h i / c i h e / f a u / p o h i / c a u / m a h e / c i h u / r a h a / p u h a / l i h e / l a h u / b a h i / k a h i / s a u / f a h e / b e h i / t i h i / j a h e / g a h a / v a h o / j i h o / m e h a / d o h e / j i h e / p i h o / g a u / z u h e / m e h e / r a i ) &post-word

BAhE <- &cmavo ( b a h e / z a h e ) &post-word

BE <- &cmavo ( b e ) &post-word

BEI <- &cmavo ( b e i ) &post-word

BEhO <- &cmavo ( b e h o ) &post-word

BIhE <- &cmavo ( b i h e ) &post-word

BIhI <- &cmavo ( m i h i / b i h o / b i h i ) &post-word

BO <- &cmavo ( b o ) &post-word

BOI <- &cmavo ( b o i ) &post-word

BU <- &cmavo ( b u ) &post-word

; BY tries ybu first.  Under standard PEG precedence (sequence binds tighter
; than "/"), the &cmavo and &post-word checks apply only to the second
; alternative, not to ybu.
BY <- ybu / &cmavo ( j o h o / r u h o / g e h o / j e h o / l o h a / n a h a / s e h e / t o h a / g a h e / y h y /  b y / c y / d y / f y / g y / j y / k y / l y / m y / n y / p y / r y / s y / t y / v y / x y / z y ) &post-word

; cmavo classes CAhA through CUhE.  Each rule checks by lookahead that a
; cmavo starts here, consumes one of the listed spellings, and then checks
; that post-word follows.
CAhA <- &cmavo ( c a h a / p u h i / n u h o / k a h e ) &post-word

CAI <- &cmavo ( p e i / c a i / c u h i / s a i / r u h e ) &post-word

CEI <- &cmavo ( c e i ) &post-word

CEhE <- &cmavo ( c e h e ) &post-word

CO <- &cmavo ( c o ) &post-word

COI <- &cmavo ( j u h i / c o i / f i h i / t a h a / m u h o / f e h o / c o h o / p e h u / k e h o / n u h e / r e h i / b e h e / j e h e / m i h e / k i h e / v i h o ) &post-word

CU <- &cmavo ( c u ) &post-word

CUhE <- &cmavo ( c u h e / n a u ) &post-word

; cmavo classes DAhO through FUhO.  Each rule checks by lookahead that a
; cmavo starts here, consumes one of the listed spellings, and then checks
; that post-word follows.
DAhO <- &cmavo ( d a h o ) &post-word

DOI <- &cmavo ( d o i ) &post-word

DOhU <- &cmavo ( d o h u ) &post-word

; Order matters: "f a i" precedes "f a" and "f i h a" precedes "f i".  With
; ordered choice the shorter spelling would otherwise match first and the
; following &post-word check would then fail on the leftover letters.
FA <- &cmavo ( f a i / f a / f e / f o / f u / f i h a / f i ) &post-word

FAhA <- &cmavo ( d u h a / b e h a / n e h u / v u h a / g a h u / t i h a / n i h a / c a h u / z u h a / r i h u / r u h u / r e h  o / t e h e / b u h u / n e h a / p a h o / n e h i / t o h o / z o h i / z e h o / z o h a / f a h a ) &post-word

FAhO <- &cmavo ( f a h o ) &post-word

FEhE <- &cmavo ( f e h e ) &post-word

FEhU <- &cmavo ( f e h u ) &post-word

FIhO <- &cmavo ( f i h o ) &post-word

FOI <- &cmavo ( f o i ) &post-word

FUhA <- &cmavo ( f u h a ) &post-word

FUhE <- &cmavo ( f u h e ) &post-word

FUhO <- &cmavo ( f u h o ) &post-word

; cmavo classes GA through GUhA.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
; In GA, "g e h i" is listed before its prefix "g e" so that ordered choice
; reaches the longer spelling.
GA <- &cmavo ( g e h i / g e /  g o / g a / g u ) &post-word

GAhO <- &cmavo ( k e h i / g a h o ) &post-word

GEhU <- &cmavo ( g e h u ) &post-word

GI <- &cmavo ( g i ) &post-word

GIhA <- &cmavo ( g i h e / g i h i / g i h o / g i h a / g i h u ) &post-word

; In GOI, "p o h u" and "p o h e" are listed before their prefix "p o".
GOI <- &cmavo ( n o h u / n e / g o i / p o h u / p e / p o h e / p o ) &post-word

GOhA <- &cmavo ( m o / n e i / g o h u / g o h o / g o h i / n o h a / g o h e / g o h a / d u / b u h a / b u h e / b u h i / c o h  e ) &post-word

GUhA <- &cmavo ( g u h e / g u h i / g u h o / g u h a / g u h u ) &post-word

; cmavo classes I through KUhO.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
I <- &cmavo ( i ) &post-word

; "j e h i" is listed before its prefix "j e" (ordered choice).
JA <- &cmavo ( j e h i / j e /  j o / j a / j u ) &post-word

JAI <- &cmavo ( j a i ) &post-word

JOhI <- &cmavo ( j o h i ) &post-word

; "c e h o" is listed before its prefix "c e" (ordered choice).
JOI <- &cmavo ( f a h u / p i h u / j o i / c e h o / c e / j o h u / k u h a / j o h e / j u h e ) &post-word

KE <- &cmavo ( k e ) &post-word

KEhE <- &cmavo ( k e h e ) &post-word

KEI <- &cmavo ( k e i ) &post-word

KI <- &cmavo ( k i ) &post-word

; The two-letter spellings that are prefixes of longer ones (m a, d a, d e,
; d i, k o, m i, d o) all come after those longer spellings in the list.
KOhA <- &cmavo ( d a h u / d a h e / d i h u / d i h e / d e h u / d e h e / d e i / d o h i / m i h o / m a h a / m i h a / d o h o  / k o h a / f o h u / k o h e / k o h i / k o h o / k o h u / f o h a / f o h e / f o h i / f o h o / v o h a / v o h e / v o h i /  v o h o / v o h u / r u / r i / r a / t a / t u / t i / z i h o / k e h a / m a / z u h i / z o h e / c e h u / d a / d e / d i / k  o / m i / d o ) &post-word

KU <- &cmavo ( k u ) &post-word

KUhE <- &cmavo ( k u h e ) &post-word

KUhO <- &cmavo ( k u h o ) &post-word

; cmavo classes LA through LUhU.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
; In LA and LE the bare two-letter spellings (l a, l o, l e) come last, after
; every longer spelling that begins with them (ordered choice).
LA <- &cmavo ( l a i / l a h i / l a ) &post-word

LAU <- &cmavo ( c e h a / l a u / z a i / t a u ) &post-word

LAhE <- &cmavo ( t u h a / l u h a / l u h o / l a h e / v u h i / l u h i / l u h e ) &post-word

LE <- &cmavo ( l e i / l o i / l e h i / l o h i / l e h e / l o h e / l o / l e ) &post-word

LEhU <- &cmavo ( l e h u ) &post-word

LI <- &cmavo ( m e h o / l i ) &post-word

LIhU <- &cmavo ( l i h u ) &post-word

LOhO <- &cmavo ( l o h o ) &post-word

LOhU <- &cmavo ( l o h u ) &post-word

LU <- &cmavo ( l u ) &post-word

LUhU <- &cmavo ( l u h u ) &post-word

; cmavo classes MAhO through MOI.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
MAhO <- &cmavo ( m a h o ) &post-word

MAI <- &cmavo ( m o h o / m a i ) &post-word

ME <- &cmavo ( m e ) &post-word

MEhU <- &cmavo ( m e h u ) &post-word

MOhE <- &cmavo ( m o h e ) &post-word

MOhI <- &cmavo ( m o h i ) &post-word

MOI <- &cmavo ( m e i / m o i / s i h e / c u h o / v a h e ) &post-word

; cmavo classes NA through NUhU.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
NA <- &cmavo ( j a h a / n a ) &post-word

NAI <- &cmavo ( n a i ) &post-word

NAhE <- &cmavo ( t o h e / j e h a / n a h e / n o h e ) &post-word

NAhU <- &cmavo ( n a h u ) &post-word

NIhE <- &cmavo ( n i h e ) &post-word

NIhO <- &cmavo ( n i h o / n o h i ) &post-word

NOI <- &cmavo ( v o i / n o i / p o i ) &post-word

NU <- &cmavo ( n i / d u h u / s i h o / n u / l i h i / k a / j e i / s u h u / z u h o / m u h e / p u h u / z a h i ) &post-word

NUhA <- &cmavo ( n u h a ) &post-word

NUhI <- &cmavo ( n u h i ) &post-word

NUhU <- &cmavo ( n u h u ) &post-word

; cmavo classes PA through PU.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
; PA additionally accepts a single `digit` as its last alternative.  Shorter
; spellings that are prefixes of longer ones (p i, n o, p a, r e, c i, s o)
; are listed after those longer spellings, as ordered choice requires.
PA <- &cmavo ( d a u / f e i / g a i / j a u / r e i / v a i / p i h e / p i /  f i h u / z a h u / m e h i / n i h u / k i h o / c e h i / m a h u / r a h e / d a h a / s o h a / j i h i / s u h o / s u h e / r o / r a u / s o h u / s o h i / s o h e / s o h o / m o h a / d u h e / t e h o / k a h o / c i h i / t u h o / x o / p a i / n o h o / n o / p a / r e / c i / v o / m u / x a / z e / b i / s o / digit ) &post-word

PEhE <- &cmavo ( p e h e ) &post-word

PEhO <- &cmavo ( p e h o ) &post-word

PU <- &cmavo ( b a / p u / c a ) &post-word

; cmavo classes RAhO through SU.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
RAhO <- &cmavo ( r a h o ) &post-word

ROI <- &cmavo ( r e h u / r o i ) &post-word

SA <- &cmavo ( s a ) &post-word

SE <- &cmavo ( s e / t e / v e / x e ) &post-word

SEI <- &cmavo ( s e i / t i h o ) &post-word

SEhU <- &cmavo ( s e h u ) &post-word

SI <- &cmavo ( s i ) &post-word

SOI <- &cmavo ( s o i ) &post-word

SU <- &cmavo ( s u ) &post-word

; cmavo classes TAhE through TUhU.  Each rule checks by lookahead that a
; cmavo starts here, consumes one of the listed spellings, and then checks
; that post-word follows.
TAhE <- &cmavo ( r u h i / t a h e / d i h i / n a h o ) &post-word

TEhU <- &cmavo ( t e h u ) &post-word

TEI <- &cmavo ( t e i ) &post-word

; "t o h i" is listed before its prefix "t o" (ordered choice).
TO <- &cmavo ( t o h i / t o ) &post-word

TOI <- &cmavo ( t o i ) &post-word

TUhE <- &cmavo ( t u h e ) &post-word

TUhU <- &cmavo ( t u h u ) &post-word

; cmavo class UI: checks by lookahead that a cmavo starts here, consumes one
; of the listed spellings, and then checks that post-word follows.  The
; vowel-only spellings (e.g. "i e", "u a") are written with the plain letter
; rules i / u, not with the glide rule.
UI <- &cmavo ( i h a / i e / a h e / u h i / i h o / i h e / a h a / i a / o h i / o h e / e h e / o i / u o / e h i / u h o / a u /  u a / a h i / i h u / i i / u h a / u i / a h o / a i / a h u / i u / e i / o h o / e h a / u u / o h a / o h u / u h u / e h o / i  o / e h u / u e / i h i / u h e / b a h a / j a h o / c a h e / s u h a / t i h e / k a h u / s e h o / z a h a / p e h i / r u h a  / j u h a / t a h o / r a h u / l i h a / b a h u / m u h a / d o h a / t o h u / v a h i / p a h e / z u h u / s a h e / l a h a /  k e h u / s a h u / d a h i / j e h u / s a h a / k a u / t a h u / n a h i / j o h a / b i h u / l i h o / p a u / m i h u / k u h  i / j i h a / s i h a / p o h o / p e h a / r o h i / r o h e / r o h o / r o h u / r o h a / r e h e / l e h o / j u h o / f u h i  / d a i / g a h i / z o h o / b e h u / r i h e / s e h i / s e h a / v u h e / k i h a / x u / g e h e / b u h o ) &post-word

; cmavo classes VA through VUhO.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
VA <- &cmavo ( v i / v a / v u ) &post-word

VAU <- &cmavo ( v a u ) &post-word

VEI <- &cmavo ( v e i ) &post-word

VEhO <- &cmavo ( v e h o ) &post-word

VUhU <- &cmavo ( g e h a / f u h u / p i h i / f e h i / v u h u / s u h i / j u h u / g e i / p a h i / f a h i / t e h a / c u h a  / v a h a / n e h o / d e h o / f e h a / s a h o / r e h a / r i h o / s a h i / p i h a / s i h i ) &post-word

VEhA <- &cmavo ( v e h u / v e h a / v e h i / v e h e ) &post-word

VIhA <- &cmavo ( v i h i / v i h a / v i h u / v i h e ) &post-word

VUhO <- &cmavo ( v u h o ) &post-word

; cmavo classes XI through ZOhU.  Each rule checks by lookahead that a cmavo
; starts here, consumes one of the listed spellings, and then checks that
; post-word follows.
XI <- &cmavo ( x i ) &post-word

; Y: one or more y letters.
Y <- &cmavo ( y+ ) &post-word

ZAhO <- &cmavo ( c o h i / p u h o / c o h u / m o h u / c a h o / c o h a / d e h a / b a h o / d i h a / z a h o ) &post-word

ZEhA <- &cmavo ( z e h u / z e h a / z e h i / z e h e ) &post-word

ZEI <- &cmavo ( z e i ) &post-word

ZI <- &cmavo ( z u / z a / z i ) &post-word

ZIhE <- &cmavo ( z i h e ) &post-word

ZO <- &cmavo ( z o ) &post-word

ZOI <- &cmavo ( z o i / l a h o ) &post-word

ZOhU <- &cmavo ( z o h u ) &post-word