/*
  Contributed by Adam Shirey (adam.shirey@lexisnexis.com).

  Data originating from Project Gutenberg Feed's page
  (gutenberg.org/wiki/Gutenberg:Feeds).

  To spray as XML:
    1. Download the current Project Gutenberg catalog
       (direct download link: http://www.gutenberg.org/feeds/catalog.rdf.zip).
    2. Unzip catalog.rdf using Windows Explorer or a command-line zip utility.
    3. After loading catalog.rdf to your dropzone, select the appropriate
       machine/IP address and local path.
    4. Set the Row Tag to "pgterms:etext" and the label to
       "ecldata::in::gutenberg_catalog" and click Submit.
*/

// One subject heading taken from an entry of a record's subject bag
// (see the 'subjects' child dataset in 'layout' below).
layout_subjects := record
  string subject {xpath('dcterms:LCSH/rdf:value')};
end;

// One catalog entry (one pgterms:etext row). Each field is read from the
// element named by its xpath, relative to the row tag.
layout := record, maxlength(10240)
  string publisher       {xpath('dc:publisher')};
  string title           {xpath('dc:title')};
  string description     {xpath('dc:description')};
  string tableOfContents {xpath('dc:tableOfContents')};
  string creator         {xpath('dc:creator')};
  string friendlyTitle   {xpath('pgterms:friendlytitle')};
  // The leaf element is the namespaced rdf:value, exactly as for the
  // created/modified/subject fields below; an unprefixed 'value' does not
  // name the same element.
  string language        {xpath('dc:language/dcterms:ISO639-2/rdf:value')};
  string created         {xpath('dc:created/dcterms:W3CDTF/rdf:value')};
  string modified        {xpath('dc:modified/dcterms:W3CDTF/rdf:value')};
  string rights          {xpath('dc:rights')};
  // library of congress subject heading; see en.wikipedia.org/wiki/Library_of_Congress_Subject_Headings
  string subject_lcsh    {xpath('dc:subject/dcterms:LCSH/rdf:value')};
  // library of congress classification; see en.wikipedia.org/wiki/Library_of_Congress_Classification
  string subject_lcc     {xpath('dc:subject/dcterms:LCC/rdf:value')};
  // subjects supplied as a bag of entries rather than a single heading
  dataset(layout_subjects) subjects {xpath('dc:subject/rdf:Bag/rdf:li')};
end;

// The sprayed catalog: one row per pgterms:etext element under rdf:RDF.
gutenberg := dataset(
  '~ecldata::in::gutenberg_catalog',
  layout,
  xml('rdf:RDF/pgterms:etext')
);

// determine the top ten 'creator' values
creator_info := table(
  gutenberg,
  {creator, n := count(group)},
  creator
);
creator_top10 := choosen( sort(creator_info, -n), 10 );
output(creator_top10, named('top10_creators'));

// a sampling of titles based on their class and subclass

// Philosophy, Psychology, Religion - Buddhism
output( gutenberg(subject_lcc='BQ'), named('buddhism') );

// World History - Netherlands (Holland)
output( gutenberg(subject_lcc='DJ'), named('netherlands_history') );

// Geography, Anthropology, Recreation - Recreation, Leisure
output( gutenberg(subject_lcc='GV'), named('recreation') );

// Medicine - Pathology
output( gutenberg(subject_lcc='RB'), named('pathology') );

// Technology - Photography
output( gutenberg(subject_lcc='TR'), named('photography') );

// American History -- Civil War period, 1861-1865
// The numeric part of the classification is compared as a number. Comparing
// it as a string is lexical, so values such as '5', '46' or '6' would fall
// "between" '456' and '655' and be reported as Civil War titles.
output(
  gutenberg(
    subject_lcc[1]='E' and
    (unsigned)subject_lcc[2..] between 456 and 655
  ),
  named('us_civil_war')
);

// using HPCC to detect potential mis-categorized books, find records with a
// Library of Congress Classification of BF (psychology) that don't include
// the 'psych' prefix
miscategorized := gutenberg(
  subject_lcc='BF',                                              // records in the Philosophy, Psychology, Religion class, Psychology subclass
  not regexfind( '\\bpsych\\B', title, NOCASE ),                 // but the title doesn't contain a 'psych-' word (case-insensitive)
  not regexfind( '\\bpsych\\B', description, NOCASE ),           // nor does the description
  not exists( subjects( regexfind( '\\bpsych\\B', subject, NOCASE ) ) ) // nor do any of the subjects
);
output( miscategorized, named('miscategorized_psychology') );