Project Gutenberg ebook feed analysis

This sample code uses Project Gutenberg's feeds and performs some basic querying and analysis to find potentially mis-categorized books.

/*

Contributed by Adam Shirey (adam.shirey@lexisnexis.com).
Data originates from Project Gutenberg's Feeds page (gutenberg.org/wiki/Gutenberg:Feeds).

To Spray as XML, download the current Project Gutenberg catalog (direct download link: http://www.gutenberg.org/feeds/catalog.rdf.zip).
Unzip catalog.rdf using Windows Explorer or a command-line zip utility.
After loading catalog.rdf to your dropzone, select the appropriate machine/IP address and local path.
Set the Row Tag to "pgterms:etext" and the label to "ecldata::in::gutenberg_catalog" and click Submit.
*/


// Child-row layout for one subject entry of a book.
// The single field is read from the dcterms:LCSH/rdf:value path relative to
// whatever element each child row is mapped to by the parent layout.
layout_subjects := record
  string subject {xpath('dcterms:LCSH/rdf:value')};
end;
// Row layout for one catalog entry. Each field is populated from the XPATH
// given in its braces, relative to the row element; rows are capped at
// 10240 bytes by the maxlength attribute.
layout := record, maxlength(10240)
  string publisher {xpath('dc:publisher')};
  string title {xpath('dc:title')};
  string description {xpath('dc:description')};
  string tableOfContents {xpath('dc:tableOfContents')};
  string creator {xpath('dc:creator')};
  string friendlyTitle {xpath('pgterms:friendlytitle')};
  // Read from rdf:value, matching every other qualified field in this layout
  // (created, modified, subject_lcsh, subject_lcc).
  // NOTE(review): previously ended in plain 'value' -- confirm against the catalog file.
  string language {xpath('dc:language/dcterms:ISO639-2/rdf:value')};
  string created {xpath('dc:created/dcterms:W3CDTF/rdf:value')};
  string modified {xpath('dc:modified/dcterms:W3CDTF/rdf:value')};
  string rights {xpath('dc:rights')};
  string subject_lcsh {xpath('dc:subject/dcterms:LCSH/rdf:value')}; // library of congress subject heading; see en.wikipedia.org/wiki/Library_of_Congress_Subject_Headings
  string subject_lcc {xpath('dc:subject/dcterms:LCC/rdf:value')}; // library of congress classification; see en.wikipedia.org/wiki/Library_of_Congress_Classification
  // One child row per rdf:li element inside the dc:subject bag.
  dataset(layout_subjects) subjects {xpath('dc:subject/rdf:Bag/rdf:li')};
end;

// The sprayed catalog, read as XML: one record per pgterms:etext element
// found under the rdf:RDF root, shaped by the layout record.
gutenberg := dataset(
  '~ecldata::in::gutenberg_catalog',
  layout,
  xml('rdf:RDF/pgterms:etext')
);

// determine the top ten 'creator' values

// One row per distinct creator value, with n = number of catalog rows carrying it.
creator_info := table( gutenberg, {creator, n := count(group)}, creator );

// TOPN returns just the 10 rows with the highest n (descending), so the whole
// aggregate does not need to be sorted only to discard all but ten rows.
creator_top10 := topn( creator_info, 10, -n );

output( creator_top10, named('top10_creators') );

// A sampling of titles, each selected by its Library of Congress class and subclass.

// BQ: Philosophy, Psychology, Religion - Buddhism
output( gutenberg( subject_lcc = 'BQ' ), named('buddhism') );

// DJ: World History - Netherlands (Holland)
output( gutenberg( subject_lcc = 'DJ' ), named('netherlands_history') );

// GV: Geography, Anthropology, Recreation - Recreation, Leisure
output( gutenberg( subject_lcc = 'GV' ), named('recreation') );

// RB: Medicine - Pathology
output( gutenberg( subject_lcc = 'RB' ), named('pathology') );

// TR: Technology - Photography
output( gutenberg( subject_lcc = 'TR' ), named('photography') );

// E456 through E655: American History -- Civil War period, 1861-1865.
// The first character must be 'E'; the remainder must fall in the inclusive range.
// NOTE(review): the remainder is compared as text, not as a number -- presumably
// the codes always carry three digits; confirm against the data.
output(
  gutenberg(
    subject_lcc[1] = 'E',
    subject_lcc[2..] >= '456',
    subject_lcc[2..] <= '655'
  ),
  named('us_civil_war')
);

// using HPCC to detect potential mis-categorized books, find records with a Library of Congress
// Classification of BF (psychology) that don't include the 'psych' prefix.
//
// The pattern \bpsych\B (backslashes doubled inside the ECL string literal) matches 'psych' at the
// start of a word when it is followed by at least one more word character, i.e. a 'psych-' prefixed
// word. All filter conditions below are comma-separated and therefore must all hold.
miscategorized := gutenberg(
  subject_lcc = 'BF',                                                    // records in the Philosophy, Psychology, Religion class, Psychology subclass
  not regexfind( '\\bpsych\\B', title, NOCASE ),                         // but the title doesn't contain a 'psych-' word (case-insensitive)
  not regexfind( '\\bpsych\\B', description, NOCASE ),                   // nor does the description
  not exists( subjects( regexfind( '\\bpsych\\B', subject, NOCASE ) ) )  // nor do any of the subjects
);
output( miscategorized, named('miscategorized_psychology') );