archives

An attempt to make languages first class citizens, starting with the syntax

Hi all,

I've been playing recently with the idea of making languages first class citizens in the platform, starting with the syntax, on top of an underlying, executable type system, such as .NET's CTS implementation in the CLR.

Note that the natural-language parsing examples that follow are not the intended subject of discussion: they are merely introductory examples from the same "topic neighborhood" (grammars). Eventually, the actual subject to contemplate will be the design of programming languages (PLs), modeling languages, and DSLs.

Anyway, so here's what I've got working so far (warning: pretty unpolished and subject to change; mostly a mere proof-of-concept, still) :

http://www.ysharp.net/2010/09/OnTheWayToYSharp/ndoc

http://www.ysharp.net/2010/09/OnTheWayToYSharp/PEGTest_r5.zip

(Please also read the copying permission notice : http://www.ysharp.net/2010/09/OnTheWayToYSharp/COPYING.txt)

Which enables the following, for instance :

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using System.Languages;

namespace PEGTest
{
    /// <summary>
    /// Language declared over Language.Void; the English class in this example is
    /// in turn declared over this type. The body is omitted from the excerpt.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the type argument presumably names the language this one
    /// builds upon, with Language.Void meaning "none" - confirm against the
    /// System.Languages library.
    /// </remarks>
    public class NaturalLanguage : Language<Language.Void>
    {
        // (code omitted)
    }

    /// <summary>
    /// Language declared over NaturalLanguage. StandardEnglish and ExtendedEnglish
    /// both use this type as their type argument. The body is omitted from the excerpt.
    /// </summary>
    public class English : Language<NaturalLanguage>
    {
        // (code omitted)
    }

    /// <summary>
    /// Example grammar for a small subset of English, declared over English.
    /// The grammar rules are built in <see cref="Define"/> using the Let combinators.
    /// </summary>
    public class StandardEnglish : Language<English>
    {
        // Shared instance obtained from Coin; presumably a per-type singleton - TODO confirm.
        public static readonly StandardEnglish Language = StandardEnglish.Coin<StandardEnglish>();

        /// <summary>Creates the language with a null chain.</summary>
        public StandardEnglish() : this(null) { }

        /// <summary>
        /// Creates the language, forwarding <paramref name="chain"/> to the base class,
        /// and installs Evaluate as the Semantic handler.
        /// </summary>
        /// <param name="chain">Passed through unchanged to the base constructor.</param>
        public StandardEnglish(Select chain)
            : base(chain)
        {
            Semantic = Evaluate;
        }
	
        /// <summary>
        /// Builds the grammar rules and returns the top-level "syntax" rule:
        /// optional leading whitespace followed by the "Sentences" rule.
        /// </summary>
        /// <returns>The top-level rule built by the final Let.Seq call.</returns>
        protected override Select Define()
        {
            // Lexical rules: whitespace (regex \s+), its Let.Opt-wrapped form, punctuation and a few words.
            var SPACE = Let.Regex("SPACE", @"\s+");
            var OPTSPACE = Let.Opt(SPACE);
            var HYPHEN = Let.Token("HYPHEN", "-");
            var PERIOD = Let.Token("PERIOD", ".");
            var A = Let.Token("A", "a");
            var THE = Let.Token("THE", "the");
            var BOY = Let.Token("BOY", "boy");
            // Noun phrase: either the "Noun Group" sequence (Det, SPACE, OptAdjNoun) or OptAdjNoun alone.
            // NOTE(review): Det and OptAdjNoun are not declared in this excerpt; presumably
            // they are among the omitted rules.
            var NP = Let.Or("Noun Phrase", Let.Seq("Noun Group", Det, SPACE, OptAdjNoun), OptAdjNoun);

            // other rules omitted...

            // Verb phrase: either the "Verb Group" sequence (OptAdvVerb, SPACE, NP) or OptAdvVerb alone.
            // NOTE(review): OptAdvVerb is not declared in this excerpt, and the exact meaning
            // of Let.Expect is not visible here - confirm against System.Languages.
            var VP = Let.Or(Let.Seq("Verb Group", Let.Expect(OptAdvVerb), SPACE, Let.Expect(NP)), OptAdvVerb);

            // Phrase: a noun phrase, then whitespace and a verb phrase.
            var phrase = Let.Seq("Phrase", Let.Expect(NP), Let.Expect(Let.Seq("Verb Phrase", SPACE, VP)));

            // Sentence: a phrase, then a period (after optional whitespace), then optional trailing whitespace.
            var sentence = Let.Seq("Sentence", Let.Expect(phrase), Let.Expect(Let.Seq("Period", OPTSPACE, PERIOD)), OPTSPACE);

            // Top-level rule: optional leading whitespace, then the "Sentences" rule built by Let.Some.
            return Let.Seq("syntax", OPTSPACE, Let.Expect(Let.Some("Sentences", sentence)));
        }
    }
}

and

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using System.Languages;

namespace PEGTest
{
    /// <summary>
    /// ExtendedEnglish demonstrates refining a grammar dynamically, at parse time:
    /// the parse context itself triggers the refinement when 'so-called' is used
    /// in noun phrases.
    ///
    /// (See the ExtendedEnglish.Evaluate method for the implementation.)
    /// </summary>
    public class ExtendedEnglish : Language<English>
    {
        public static readonly ExtendedEnglish Language = ExtendedEnglish.Coin<ExtendedEnglish>(StandardEnglish.Language);

        // Set once the amendments have been applied, so they are applied only once.
        private bool amended;

        /// <summary>Creates the language with a null chain.</summary>
        public ExtendedEnglish() : this(null) { }

        /// <summary>
        /// Creates the language, forwarding <paramref name="chain"/> to the base class,
        /// and installs <see cref="Evaluate"/> as the Semantic handler.
        /// </summary>
        public ExtendedEnglish(Select chain)
            : base(chain)
        {
            Semantic = Evaluate;
        }

        /// <summary>
        /// Semantic handler. The first time a value whose Ident is "SO_CALLED" is
        /// received, the grammar is amended with the foreign noun and foreign
        /// adjective rules. The received value is always returned as is.
        /// </summary>
        /// <param name="select">Not used by this implementation.</param>
        /// <param name="value">The value being evaluated.</param>
        /// <returns>The same <paramref name="value"/> that was passed in.</returns>
        protected override Value Evaluate(string select, Value value)
        {
            bool isSoCalled = value.Ident == "SO_CALLED";

            // Nothing to do unless this is a "SO_CALLED" value and no amendment happened yet.
            if (!isSoCalled || amended)
            {
                return value;
            }

            Amend(
                // Foreign nouns: "connoisseur" and "rendez-vous" / "rendez vous".
                (Select let) =>
                {
                    var connoisseur = let.Token("CONNOISSEUR", "connoisseur");
                    var rendez = let.Token("RENDEZ", "rendez");
                    var vous = let.Token("VOUS", "vous");
                    var rendezVous = let.Seq(
                        "RENDEZ_VOUS",
                        rendez,
                        let.Or(let["@HYPHEN"], let["@SPACE"]),
                        vous);
                    let.Or("Foreign Noun", connoisseur, rendezVous);
                    return null;
                },
                // Foreign adjectives: "chic" and "petite".
                (Select let) =>
                {
                    var chic = let.Token("CHIC", "chic");
                    var petite = let.Token("PETITE", "petite");
                    let.Or("Foreign Adj", chic, petite);
                    return null;
                }
            );
            amended = true;

            return value;
        }
    }
}

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using System.Languages;

namespace PEGTest
{
    /// <summary>
    /// Frenglish is StandardEnglish with the same amendments as ExtendedEnglish
    /// built in statically: they live in [Syntax]-marked methods, instead of
    /// being applied dynamically using Amend(...).
    ///
    /// (See the Frenglish.FrenchNoun and Frenglish.FrenchAdj methods.)
    /// </summary>
    public class Frenglish : StandardEnglish
    {
        public static readonly Frenglish Language = Frenglish.Coin<Frenglish>();

        /// <summary>Creates the language with a null chain.</summary>
        public Frenglish() : this(null) { }

        /// <summary>Creates the language, forwarding <paramref name="chain"/> to StandardEnglish.</summary>
        public Frenglish(Select chain) : base(chain) { }

        /// <summary>
        /// Declares the foreign noun rules: "connoisseur" and "rendez-vous" / "rendez vous".
        /// </summary>
        /// <returns>Always null.</returns>
        [Syntax]
        private Select FrenchNoun()
        {
            var connoisseur = Let.Token("CONNOISSEUR", "connoisseur");
            var rendez = Let.Token("RENDEZ", "rendez");
            var vous = Let.Token("VOUS", "vous");
            var rendezVous = Let.Seq(
                "RENDEZ_VOUS",
                rendez,
                Let.Or(Let["@HYPHEN"], Let["@SPACE"]),
                vous);
            Let.Or("Foreign Noun", connoisseur, rendezVous);
            return null;
        }

        /// <summary>
        /// Declares the foreign adjective rules: "chic" and "petite".
        /// </summary>
        /// <returns>Always null.</returns>
        [Syntax]
        private Select FrenchAdj()
        {
            var chic = Let.Token("CHIC", "chic");
            var petite = Let.Token("PETITE", "petite");
            Let.Or("Foreign Adj", chic, petite);
            return null;
        }
    }
}

and finally :

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using System.Languages;

namespace PEGTest
{
    /// <summary>
    /// Console driver for the proof of concept: reads text from the console,
    /// parses it with the chosen language and prints the result.
    /// Several parts are omitted from this excerpt.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Loops until the user presses 'X' or 'x'.
        /// </summary>
        /// <param name="args">Command-line arguments; not used in the visible code.</param>
        static void Main(string[] args)
        {
            Language language;
            bool en, en2, fg, fr, mt, ys;
            ConsoleKeyInfo keyInfo;
            Console.WriteLine();
            Console.WriteLine("Choose your language :");

            // dull code omitted...

            // Language instances used by the demo.
            // NOTE(review): French is not defined in this excerpt.
            var empty = Language.Empty; // note this works, too: var empty = Language.Void.Language
            var english = StandardEnglish.Language;
            var extended = ExtendedEnglish.Language;
            var frenglish = Frenglish.Language;
            var french = French.Language;

            // dull code omitted...

            while ((keyInfo.KeyChar != 'X') && (keyInfo.KeyChar != 'x'))
            {
                // dull code omitted...

                // Accumulate input lines (each followed by "\r\n") until an empty line
                // or the end of input, then strip trailing whitespace.
                // NOTE(review): line, text and parsed are declared in the omitted code.
                while (!String.IsNullOrEmpty(line = Console.ReadLine()))
                    text += String.Concat(line, "\r\n");
                text = text.TrimEnd();

                parsed = language.Parse(text);

                // Print a header naming the concrete language type, for English-based languages only.
                if (language is Language<English>)
                {
                    Console.WriteLine("Syntax analysis of {0} input :", language.GetType().FullName);
                    Console.WriteLine();
                }

                // A null parse result is reported as an unrecoverable syntax error.
                if (parsed != null)
                {
                    PrettyPrint(parsed);
                }
                else
                    Console.WriteLine("unrecoverable syntax error");
                Console.WriteLine("Press 'X' to exit or any other key to continue...");
                keyInfo = Console.ReadKey();
            }
        }
    }
}

Then, at run time in the PoC's Main method above :

"The connoisseur has a rendez-vous." => results in parse errors on "connoisseur" and "rendez-vous", neither known to these grammars : StandardEnglish and ExtendedEnglish. It parses OK right away in the case of Frenglish, which does know about these nouns in its syntax-defining methods (these two above marked with the [Syntax] custom attribute).

However, it's more interesting in the specific case of ExtendedEnglish (originally equivalent to StandardEnglish), as we also have the following specific behavior about it (& only it) :

"The so-called connoisseur has a sweet rendez-vous." => does parse OK, thanks to the application of grammar refinements/amendments triggered by the parse context itself, when encountering the occurrence of "so-called" in the input.

These dynamic refinements/amendments, which ExtendedEnglish applies to itself at parse time (well, yes!), are implemented in the ExtendedEnglish.Evaluate method (see above).

I know I'm going to need help with the next steps I'm contemplating to build upon this.

Is anyone interested in joining in my effort? (If so, feel free to get in touch at ysharp {dot} design {at} gmail {dot} com)

Note this is also related to the following :

http://www.ysharp.net/the.language/rationale

and

http://www.ysharp.net/the.language/design/?p=93

Thanks, &

Cordially,
Cyril