This example parses raw phone numbers from a specific field in an input dataset into a single standardized output field containing only the digits. If the raw input has no area code, the output contains three zeroes in its place.
// Sample input: one raw phone number per record, in a variety of formats.
infile := DATASET([{'5619994581'},{'15619994581'},
                   {'(561) 999-4581'},{'(561)999-4581'},
                   {'561-999-4581'},{'561 999 4581'},
                   {'561.999.4581'},{'561/999/4581'},
                   {'561 999-4581'},{'9994581'},
                   {'999-4581'}],{STRING20 rawnumber});

// Basic building blocks
PATTERN numbers   := PATTERN('[0-9]')+;     // one or more digits
PATTERN alpha     := PATTERN('[A-Za-z]')+;  // one or more letters
PATTERN ws        := [' ','\t']*;           // optional run of spaces/tabs
PATTERN sepchar   := PATTERN('[-./ ]');     // a single separator character
PATTERN Seperator := ws sepchar ws;         // separator with optional surrounding whitespace

// Area Code
PATTERN OpenParen  := ['[','(','{','<'];
PATTERN CloseParen := [']',')','}','>'];
PATTERN FrontDigit := ['1', '0'] OPT(Seperator);  // leading 1 or 0 before the area code
PATTERN areacode   := OPT(FrontDigit) OPT(OpenParen) numbers length(3) OPT(CloseParen);

// Last Seven digits
PATTERN exchange := numbers length(3);
PATTERN lastfour := numbers length(4);
PATTERN seven    := exchange OPT(Seperator) lastfour;

// Extension
PATTERN extension := ws alpha ws numbers;

// Phone Number
PATTERN phonenumber := OPT(areacode) OPT(Seperator) seven opt(extension) ws;

// Output layout: the input record plus the cleaned number.
//   - no phone number matched      => blank
//   - matched without an area code => '000' + exchange + last four digits
//   - otherwise                    => area code digits + exchange + last four digits
layout_phone_append := RECORD
  infile;
  STRING10 clean_phone := MAP(NOT MATCHED(phonenumber) => '',
                              NOT MATCHED(areacode)    => '000' + MATCHTEXT(exchange) + MATCHTEXT(lastfour),
                              MATCHTEXT(areacode/numbers) + MATCHTEXT(exchange) + MATCHTEXT(lastfour));
END;

// NOT MATCHED is specified so the MAP's first branch can produce a blank
// clean_phone for input that does not match phonenumber.
outfile := PARSE(infile, rawnumber, phonenumber, layout_phone_append,
                 FIRST, NOT MATCHED, WHOLE);

OUTPUT(outfile);
This example parses a small subset of raw movie data into standard database fields:
IMPORT Std;

// Raw input layout: one line of actor/movie text per record.
Layout_Actors_Raw := RECORD
  STRING120 IMDB_Actor_Desc;
END;

// Sample data. The actor name appears only on the first line of each group;
// the following lines of the group carry only the movie details.
File_Actors := DATASET([
  {'A.V., Subba Rao Chenchu Lakshmi (1958/I) <10>'},
  {' Jayabheri (1959) <17>'},
  {' Madalasa (1948) <3>'},
  {' Mangalya Balam (1958) <12>'},
  {' Mohini Bhasmasura (1938) <3>'},
  {' Palletoori Pilla (1950) [Kampanna Dora] <4>'},
  {' Peddamanushulu (1954) <6>'},
  {' Sarangadhara (1957) <12>'},
  {' Sri Seetha Rama Kalyanam (1961) <12>'},
  {' Sri Venkateswara Mahatmyam (1960) [Akasa Raju] <5>'},
  {' Vara Vikrayam (1939) [Judge] <12>'},
  {' Vindhyarani (1948) <7>'},
  {''},
  {'Aa, Brynjar Adjo solidaritet (1985) [Ponker] <40>'},
  {''},
  {'Aabel, Andreas Bor Borson Jr. (1938) [O.G. Hansen] <9>'},
  {' Jeppe pa bjerget (1933) [En skomakerlaerling]'},
  {' Kampen om tungtvannet (1948) <8>'},
  {' Prinsessen som ingen kunne maqlbinde (1932) [Espen Askeladd] <3>'},
  {' Spokelse forelsker seg, Et (1946) [Et spokelse] <6>'},
  {''},
  {'Aabel, Hauk (I) Alexander den store (1917) [Alexander Nyberg]'},
  {' Du har lovet mig en kone! (1935) [Professoren] <6>'},
  {' Glad gutt, En (1932) [Ola Nordistua] <1>'},
  {' Jeppe pa bjerget (1933) [Jeppe] <1>'},
  {' Morderen uten ansikt (1936)'},
  {' Store barnedapen, Den (1931) [Evensen, kirketjener] <5>'},
  {' Troll-Elgen (1927) [Piper, direktor] <9>'},
  {' Ungen (1938) [Krestoffer] <8>'},
  {' Valfangare (1939) [Jensen Sr.] <4>'},
  {''},
  {'Aabel, Per (I) Brudebuketten (1953) [Hoyland jr.] <3>'},
  {' Cafajestes, Os (1962)'},
  {' Farlige leken, Den (1942) [Fredrik Holm, doktor]'},
  {' Herre med bart, En (1942) [Ole Grong, advokat] <1>'},
  {' Kjaere Maren (1976) [Doktor]'},
  {' Kjaerlighet og vennskap (1941) [Anton Schack] <3>'},
  {' Ombyte fornojer (1939) [Gregor Ivanow] <2>'},
  {' Portrettet (1954) [Per Haug, provisor] <1>'}],
  Layout_Actors_Raw);

//Basic patterns:
PATTERN arb    := PATTERN('[-!.,\t a-zA-Z0-9]')+; //all alphanumeric & certain special characters
PATTERN ws     := [' ','\t']+;                    //word separators (space & tab)
PATTERN number := PATTERN('[0-9]')+;              //numbers

//extended patterns:
PATTERN age    := '(' number OPT('/I') ')'; //movie year -- OPT('/I') required for first rec
PATTERN role   := '[' arb ']';              //character played
PATTERN m_rank := '<' number '>';           //credit appearance number
PATTERN actor  := arb OPT(ws '(I)' ws);     //actor's name -- OPT(ws '(I)' ws)
                                            // required for last two actors

//extended pattern to parse the actual text:
// NOTE(review): this pattern requires a literal tab character between the
// actor name and the movie title. Confirm that the sample strings above
// contain tabs at that position and were not whitespace-normalized.
PATTERN line := actor '\t' arb ws OPT(age) ws OPT(role) ws OPT(m_rank) ws;

//output record structure:
NLP_layout_actor_movie := RECORD
  STRING30  actor_name := Std.Str.filterout(MATCHTEXT(actor),'\t'); // actor text with tabs removed
  STRING50  movie_name := MATCHTEXT(arb[2]);                        // second arb match (first is inside actor)
  UNSIGNED2 movie_year := (UNSIGNED)MATCHTEXT(age/number);
  STRING20  movie_role := MATCHTEXT(role/arb);
  UNSIGNED1 cast_rank  := (UNSIGNED)MATCHTEXT(m_rank/number);
END;

//and the actual parsing operation
Actor_Movie_Init := PARSE(File_Actors,
                          IMDB_Actor_Desc,
                          line,
                          NLP_layout_actor_movie,
                          WHOLE, FIRST);

// then iterate to propagate actor name in each record:
// a record with a blank actor_name takes the name from the preceding record.
NLP_layout_actor_movie IterNames(NLP_layout_actor_movie L,
                                 NLP_layout_actor_movie R) := TRANSFORM
  SELF.actor_name := IF(R.actor_Name='',L.actor_Name,R.actor_name);
  SELF := R;
END;

NLP_Actor_Movie := ITERATE(Actor_Movie_Init,IterNames(LEFT,RIGHT));

// and output the result set
OUTPUT(NLP_Actor_Movie);
An example of Tomita Parsing (using SELF):
//an example of Tomita Parsing (using SELF):

// Input layout: one expression string per record.
r1 := RECORD
  STRING value;
END;

ds := DATASET([{'1'},{'222+33*(1+2+(1))'}], r1);

// A token is one or more digits.
TOKEN val := PATTERN('[0-9]')+;

// Recursive grammar: SELF refers to the rule being defined (expr).
RULE expr := SELF '*' SELF
           | SELF '+' SELF
           | SELF '(' SELF ')'
           | val;

// Result layout: the text matched by expr.
parsed_record := RECORD
  STRING result := MATCHTEXT(expr);
END;

// The PARSE option selects the Tomita parser, which RULE definitions require.
PARSE(ds, value, expr, parsed_record,PARSE);
Another example of Tomita parsing:
// This example demonstrates the use of productions in PARSE code
//(only supported in the Tomita version of PARSE)

// Whitespace to skip between tokens (space or tab); used by SKIP(ws+) below.
PATTERN ws := [' ','\t'];

TOKEN number := PATTERN('[0-9]+');
TOKEN plus   := '+';
TOKEN minus  := '-';

attrRec := RECORD //record structure for Tomita parsing
  INTEGER val;
END;

// e0: parenthesized expression, numeric literal, or unary minus.
RULE(attrRec) e0 := '(' USE(attrRec,expr)? ')' //USE = forward reference to "expr" definition
                  | number TRANSFORM(attrRec, SELF.val := (INTEGER)$1;)
                  | minus SELF TRANSFORM(attrRec, SELF.val := -$2.val;);

// e1: multiplication and division over e0 operands.
RULE(attrRec) e1 := e0
                  | SELF '*' e0 TRANSFORM(attrRec, SELF.val := $1.val * $3.val;)
                  | SELF '/' e0 TRANSFORM(attrRec, SELF.val := $1.val / $3.val;);

// e2: addition and subtraction over e1 operands.
RULE(attrRec) e2 := e1
                  | SELF plus e1 TRANSFORM(attrRec, SELF.val := $1.val + $3.val;)
                  | SELF minus e1 TRANSFORM(attrRec, SELF.val := $1.val - $3.val;);

// Top-level rule; this is the target of the USE() forward reference in e0.
RULE(attrRec) expr := e2;

infile := DATASET([{'1+2*3'},{'1+2*100'},{'1+2+(3+4)*4/2'},{'-4*5'}], { STRING line });

// Result layout: the input fields, the computed attribute record (val),
// the matched text, and the val of the third e0 match.
resultsRec := RECORD
  RECORDOF(infile);
  attrRec;
  STRING exprText;
  INTEGER value3;
END;

// Builds one result row from the input row (L) and the attribute
// record produced by the top-level rule (attr).
resultsRec extractResults(infile L, attrRec attr) := TRANSFORM
  SELF := L;
  SELF := attr;
  SELF.exprText := MATCHTEXT;
  SELF.value3 := MATCHROW(e0[3]).val;
END;

OUTPUT(PARSE(infile,line,expr,extractResults(LEFT, $1),
             FIRST,WHOLE,PARSE,SKIP(ws+)));