cfnamegen.py:
A general discussion of this code can be found on my LiveJournal.
# Context-free grammar random name generator # Jeremy Thurgood <jerith@is.und.ac.za> # Highly experimental at present, but sort of working import random import re import sys # This should be done using gettext for i18n, but I can't be bothered to figure # out how to do it properly, so I'm using replacement strings for now. stringUndefinedNonTerminal = "Undefined non-terminal \"%(undefinedNonTerminal)s\" in rule \"%(rule)s\"." # Test grammar -- will be read from a file when I decide how to do it properly # with minimum effort (for the user and the code) orkGrammar = { "name": ["<nameStart><nameMiddle0to3><nameEnd>"], "nameMiddle0to3": ["","<nameMiddle>", "<nameMiddle><nameMiddle>", "<nameMiddle><nameMiddle><nameMiddle>"], "nameStart": ["<nsCons><nmVowel>", "<nsCons><nmVowel>", "<nsCons><nmVowel>", "<nsVowel>"], "nameMiddle": ["<nmCons><nmVowel>"], "nameEnd": ["<neCons><neVowel>", "<neCons>", "<neCons>"], "nsCons": ["D", "G", "K", "T", "Gr"], "nmCons": ["d", "g", "k", "t", "r", "s", "z", "kt", "rs", "gr"], "neCons": ["r", "s", "z"], "nsVowel": ["E", "U"], "nmVowel": ["a", "e", "i", "o", "u"], "neVowel": ["a", "u"] } fooGrammar = { "name": ["<nameStart><nameMiddle0to2><nameEnd>"], "nameMiddle0to2": ["","<nameMiddle>", "<nameMiddle><nameMiddle>"], "nameStart": ["<nsCons><nmVowel>", "<nsCons><nmVowel>", "<nsCons><nmVowel>", "<nsVowel>"], "nameMiddle": ["<nmCons><nmVowel>"], "nameEnd": ["<neCons><neVowel>", "<neCons>", "<neCons>"], "nsCons": ["J", "M", "P", "N", "Y", "D", "F"], "nmCons": ["l", "m", "lm", "th", "r", "s", "ss", "p", "f", "mb", "b", "lb", "d", "lf"], "neCons": ["r", "n", "m", "s", "y", "l", "th", "b", "lb", "f", "lf"], "nsVowel": ["A", "Au", "Ei"], "nmVowel": ["a", "e", "i", "o", "u", "au", "oa", "ei"], "neVowel": ["e", "i", "a", "au"] } # Regular expression to catch non-terminals, used frequently, so global reNonTerminal = re.compile(r"<(\w+)>") # checkTypes() is only useful while testing with internally specified grammars. # Once we're parsing an external file it becomes unnecessary since we generate # the data types ourselves instead of asking a human to do it. As such, error # strings are hardcoded. Anyone who sees them would be messing around in here # anyway. def checkTypes(nameGrammar): """Check given grammar object for correct datatypes. This function is only really necessary while the grammar's still being specified in here. It will likely disappear when we parse the grammar from a data file. """ if not isinstance(nameGrammar, dict): return "Grammar data is not a dictionary!" for rule, rhs in nameGrammar.items(): if not isinstance(rhs, list): return "Rule \"%s\" is not a list!" % rule for option in rhs: if not isinstance(option, str): return "Rule \"%s\" does not contain only strings!" % rule # Grammar verification stuff follows. We can probably make this throw warnings # and correct problems, but that's a job for another day. Incorrect grammars # probably won't provide useful output anyway. If this stuff gets big enough # it may be pushed into its own module. def checkUndefinedNonTerminals(nameGrammar): """Check given grammar for undefined non-terminals. An undefined non-terminal is a non-terminal symbol used in a symbol definition that has no definition of its own and cannot therefore be expanded. Undefined non-terminals can lead to ugly error messages instead of beautifully generated names. """ for rule, rhs in nameGrammar.items(): for option in rhs: tempStr = option matchNonTerminal = reNonTerminal.search(tempStr) while matchNonTerminal: if matchNonTerminal.group(1) not in nameGrammar: return {"undefinedNonTerminal": matchNonTerminal.group(1), "rule": rule} tempStr = reNonTerminal.sub("", tempStr, 1) matchNonTerminal = reNonTerminal.search(tempStr) def checkUnproductiveNonTerminals(nameGrammar): """Check grammar for possibly unproductive non-terminals. An unproductive non-terminal is a non-terminal symbol that cannot be converted to a terminal symbol in the given grammar. A good example of this is a non-terminal symbol that includes itself in its definition. This function is currently very basic and should be extended (rewritten?) to allow warnings for _possible_ unproductive non-terminals and errors for _definite_ unproductive non-terminals. Volunteers? XXX: INCOMPLETE """ def recurse(a): if a == 5: return a return recurse(a+1) grammarUnchecked = dict([(rule, "".join(rhs)) for (rule, rhs) in nameGrammar.items()]) grammarProductive = [] finished = False while not finished: print "grammarProductive:" print grammarProductive print "grammarUnchecked:" print grammarUnchecked print finished = True for rule, rhs in grammarUnchecked.items(): matchNonTerminal = reNonTerminal.search(rhs) while matchNonTerminal: matchString = matchNonTerminal.group(1) if matchString not in grammarProductive: break rhs = rhs.replace("<"+matchString+">", "") finished = False matchNonTerminal = reNonTerminal.search(rhs) if not matchNonTerminal: grammarProductive.append(rule) del grammarUnchecked[rule] finished = False continue grammarUnchecked[rule] = rhs # More grammar checking functions to come: # Unused non-terminals # Loop detection would be nice, but currently a little impractical. def checkUnusedNonTerminals(nameGrammar): """Check grammar for non-terminals that can never be reached. While unused non-terminals are irrelevant in the generation of sentences, their presence usually implies an error in the grammar. XXX: INCOMPLETE """ pass # verifyGrammar() uses the above functions to verify the correctness of a # grammar. This isn't perfect, but it should catch the most common problems. def verifyGrammar(nameGrammar): error = checkTypes(nameGrammar) if error: return error error = checkUndefinedNonTerminals(nameGrammar) if error: return stringUndefinedNonTerminal % error if "name" not in nameGrammar: return "Rule \"name\" not present!" # Now to the meat of the problem, which is actually almost trivial thanks to # the dictionary data type. I love python ;-) def nameGen(nameGrammar): nameStr = random.choice(nameGrammar["name"]) matchNonTerminal = reNonTerminal.search(nameStr) while matchNonTerminal: subStr = random.choice(nameGrammar[matchNonTerminal.group(1)]) nameStr = reNonTerminal.sub(subStr, nameStr, 1) matchNonTerminal = reNonTerminal.search(nameStr) return nameStr # Main body checkUnproductiveNonTerminals(fooGrammar) sys.exit() errorStr = verifyGrammar(fooGrammar) if errorStr: sys.exit(errorStr) print nameGen(fooGrammar)
And that's it. There's more to be done, but I've lost interest in this project for a while. Perhaps I'll come back to it later.