Home > Community > Contributions > Data Descriptors and Simple Example Programs > Project Gutenberg ebook feed analysis

Project Gutenberg ebook feed analysis

This sample code uses Project Gutenberg's feeds and performs some basic querying and analysis to find potentially miscategorized books.

/*

Contributed by Adam Shirey (adam.shirey@lexisnexis.com).

The data originates from Project Gutenberg's Feeds page (gutenberg.org/wiki/Gutenberg:Feeds).



To spray the data as XML, first download the current Project Gutenberg catalog (direct download link: http://www.gutenberg.org/feeds/catalog.rdf.zip).

Unzip catalog.rdf using Windows Explorer or a command-line zip utility.

After loading catalog.rdf to your dropzone, select the appropriate machine/IP address and local path.

Set the Row Tag to "pgterms:etext" and the label to "ecldata::in::gutenberg_catalog" and click Submit.

*/





// Child-row layout holding one Library of Congress subject heading.
// The xpath is relative to the element each child row is read from; in
// `layout` below that element is an rdf:li entry of dc:subject/rdf:Bag.
layout_subjects := record
  string subject {xpath('dcterms:LCSH/rdf:value')};
end;

// Row layout for one pgterms:etext element of the Project Gutenberg catalog.
// Every xpath is relative to the row element. The record declares a maximum
// length of 10240.
layout := record, maxlength(10240)
  string publisher       {xpath('dc:publisher')};
  string title           {xpath('dc:title')};
  string description     {xpath('dc:description')};
  string tableOfContents {xpath('dc:tableOfContents')};
  string creator         {xpath('dc:creator')};
  string friendlyTitle   {xpath('pgterms:friendlytitle')};
  // The value element is namespaced (rdf:value), exactly like the created,
  // modified and subject paths below.
  string language        {xpath('dc:language/dcterms:ISO639-2/rdf:value')};
  string created         {xpath('dc:created/dcterms:W3CDTF/rdf:value')};
  string modified        {xpath('dc:modified/dcterms:W3CDTF/rdf:value')};
  string rights          {xpath('dc:rights')};
  // Library of Congress subject heading held directly under dc:subject;
  // see en.wikipedia.org/wiki/Library_of_Congress_Subject_Headings
  string subject_lcsh    {xpath('dc:subject/dcterms:LCSH/rdf:value')};
  // Library of Congress classification held directly under dc:subject;
  // see en.wikipedia.org/wiki/Library_of_Congress_Classification
  string subject_lcc     {xpath('dc:subject/dcterms:LCC/rdf:value')};
  // Subject headings listed as rdf:li entries inside a dc:subject/rdf:Bag.
  dataset(layout_subjects) subjects {xpath('dc:subject/rdf:Bag/rdf:li')};
end;



// The sprayed catalog: the logical file named in the header comment, read as
// XML with one row per pgterms:etext element found under rdf:RDF.
gutenberg := dataset(
  '~ecldata::in::gutenberg_catalog',
  layout,
  xml('rdf:RDF/pgterms:etext')
);



// Rank 'creator' values by how many catalog rows carry them and report the
// ten most frequent.
creator_count_layout := record
  gutenberg.creator;
  n := count(group);   // number of rows sharing this creator value
end;

creator_info   := table( gutenberg, creator_count_layout, creator );
creator_ranked := sort( creator_info, -n );        // most frequent first
creator_top10  := choosen( creator_ranked, 10 );

output( creator_top10, named('top10_creators') );



// A sampling of titles selected by Library of Congress class and subclass.
// Each filter is named first, then emitted as its own named result.
buddhism_books    := gutenberg( subject_lcc = 'BQ' ); // Philosophy, Psychology, Religion - Buddhism
netherlands_books := gutenberg( subject_lcc = 'DJ' ); // World History - Netherlands (Holland)
recreation_books  := gutenberg( subject_lcc = 'GV' ); // Geography, Anthropology, Recreation - Recreation, Leisure
pathology_books   := gutenberg( subject_lcc = 'RB' ); // Medicine - Pathology
photography_books := gutenberg( subject_lcc = 'TR' ); // Technology - Photography

// American History -- Civil War period, 1861-1865: class letter 'E' followed
// by a value in the range '456'..'655'.
// NOTE(review): the range test compares strings, not numbers - confirm this
// matches the form of the classification values actually present in the data.
us_civil_war_books := gutenberg(
  subject_lcc[1] = 'E' and subject_lcc[2..] between '456' and '655'
);

output( buddhism_books,     named('buddhism') );
output( netherlands_books,  named('netherlands_history') );
output( recreation_books,   named('recreation') );
output( pathology_books,    named('pathology') );
output( photography_books,  named('photography') );
output( us_civil_war_books, named('us_civil_war') );



// Use HPCC to detect potentially miscategorized books: records classified as
// BF (psychology) in which no 'psych-' word appears in the title, the
// description, or any subject heading.
//
// The pattern matches 'psych' at the start of a word and followed by at least
// one more word character (e.g. "psychology", "Psychic"), case-insensitively.
psych_pattern := '\\bpsych\\B';

miscategorized := gutenberg(
  subject_lcc = 'BF',                                     // Philosophy, Psychology, Religion class, Psychology subclass
  not regexfind( psych_pattern, title, NOCASE ),          // the title has no 'psych-' word
  not regexfind( psych_pattern, description, NOCASE ),    // nor does the description
  not regexfind( psych_pattern, subject_lcsh, NOCASE ),   // nor the heading stored directly under dc:subject
  not exists( subjects( regexfind( psych_pattern, subject, NOCASE ) ) ) // nor any heading listed in the rdf:Bag
);

output( miscategorized, named('miscategorized_psychology') );

Getting Started with HPCC Systems

Getting Started with HPCC Systems

Let’s get started

Detailed documentation

Detailed documentation

Detailed documentation

Check out the Wiki

HPCC Systems Training

HPCC Systems Training

HPCC Systems Training

HPCC Systems Training

Welcome to the HPCC Systems developer community!

Welcome to the HPCC Systems developer community!

Welcome to the HPCC Systems developer community!

Welcome to the HPCC Systems developer community!

Welcome to the HPCC Systems developer community!

Welcome to the HPCC Systems developer community!

Welcome to the HPCC Systems developer community!

Welcome to the HPCC Systems developer community!

Project Gutenberg ebook feed analysis