/* Name : Chris Albee Email : christopher.albee@lexisnexis.com Data Source: The NHTSA Complaints database contains complaints reported to the National Highway Transportation Safety Administration (NHTSA) by consumers for nearly the last twenty years. It consists of over 850,000 records and includes complaints regarding not only automobiles, but also automobile accessories. The URL for the data source is: http://www-odi.nhtsa.dot.gov/downloads/folders/Complaints/FLAT_CMPL.zip */ /* ********************************************* Downloading and spraying the data: ********************************************* */ /* Before you begin: ensure you have installed VMWare and the HPCC ECL IDE on your system (see: runninghpccinavirtualmachine.pdf , available at http://hpccsystems.com/community/docs ). */ /* 1. Navigate to the URL where the data source file is located (see URL above), and unzip FLAT_CMPL.txt, saving to disk. 2. Upload FLAT_CMPL.txt to your default dropzone (see: "Uploading a File", found in data_handling.pdf, pp. 6-7, available at http://hpccsystems.com/community/docs ) . 3. Spray FLAT_CMPL.txt from your default dropzone to a Thor cluster (see: "Spray CSV", found in data_handling.pdf, pp. 13-14). 3a. In the Local Path textbox, enter the following path: /var/lib/HPCCSystems/mydropzone/FLAT_CMPL.txt 3b. In the Format dropdown list, ensure that 'ASCII' is selected. 3c. In the Max Record Length textbox, ensure that it is blank. Yes, blank. 3d. In the Seperator textbox, enter the following regular expression denoting tab-separation: \t 3e. In the Line Terminator textbox, enter the following regular expression: \n,\r\n 3f. In the Quote textbox, enter a single quotation mark: ' 3g. In the Group dropdown list, ensure that 'mythor' is selected. 3h. In the Label textbox, enter the following filename: ~mythor::ECLDATA::IN::NHTSA_complaints 3i. Ensure that the Overwrite and Replicate checkboxes are checked. 3j. Click Submit. 4. The DFU Workunit Detail page displays. Click the link labeled "View Progress" if you wish to watch the progress of the file spray. The file spray will be complete after a few minutes. */ IMPORT std; UCase := std.Str.ToUpperCase; /* ********************************************* Data Declarations: ********************************************* */ // The NHTSA has documentation that describes the layout for the Complaints file. See // http://www-odi.nhtsa.dot.gov/downloads/folders/Complaints/CMPL.txt . Based on this // documentation, I created the following record definition for the file: layout_in_NHTSA_complaints := RECORD, MAXLENGTH(2822) STRING9 CMPLID; STRING9 ODINO; STRING40 MFR_NAME; STRING25 MAKETXT; STRING256 MODELTXT; STRING4 YEARTXT; STRING1 CRASH; STRING8 FAILDATE; STRING1 FIRE; UNSIGNED2 INJURED; UNSIGNED2 DEATHS; STRING128 COMPDESC; STRING30 CITY; STRING2 STATE; STRING11 VIN; STRING8 DATEA; STRING8 LDATE; UNSIGNED7 MILES; UNSIGNED4 OCCURENCES; STRING2048 CDESCR; STRING4 CMPL_TYPE; STRING1 POLICE_RPT_YN; STRING8 PURCH_DT; STRING1 ORIG_OWNER_YN; STRING1 ANTI_BRAKES_YN; STRING1 CRUISE_CONT_YN; UNSIGNED2 NUM_CYLS; STRING4 DRIVE_TRAIN; STRING4 FUEL_SYS; STRING4 FUEL_TYPE; STRING4 TRANS_TYPE; UNSIGNED3 VEH_SPEED; STRING20 DOT; STRING30 TIRE_SIZE; STRING4 LOC_OF_TIRE; STRING4 TIRE_FAIL_TYPE; STRING1 ORIG_EQUIP_YN; STRING8 MANUF_DT; STRING4 SEAT_TYPE; STRING4 RESTRAINT_TYPE; STRING40 DEALER_NAME; STRING20 DEALER_TEL; STRING30 DEALER_CITY; STRING2 DEALER_STATE; STRING10 DEALER_ZIP; STRING4 PROD_TYPE; STRING1 REPAIRED_YN; END; // ...and based on it, declared the following dataset: ds_NHTSA_complaints := DATASET( '~mythor::ecldata::in::nhtsa_complaints', layout_in_NHTSA_complaints, CSV( HEADING(0), SEPARATOR('\t'), TERMINATOR('\n'), MAXLENGTH(20000) ) ); // I used a really high MAXLENGTH because, although the documentation states // that the CDESCR field is at most 2048 kB, this limit was frequently exceeded. // By a lot. // I decided to slim this dataset down, since fields 21-47 were added in // late 2007 and they're pretty sparse anyway. Also, I wanted to perform // some basic data conversion and clarify some cryptic field names. The // result of this would normally be output to disk as a 'base file': layout_base_NHTSA_complaints := RECORD, MAXLENGTH(2591) STRING9 complaint_id; STRING9 case_ref_num; STRING40 mfr_name; STRING25 make; STRING256 model; STRING4 model_year; STRING1 in_crash; UNSIGNED4 date_incident; STRING1 in_fire; UNSIGNED2 num_injured; UNSIGNED2 num_deaths; STRING128 component_desc; STRING30 consumer_city; STRING2 consumer_state_cd; STRING11 vin; UNSIGNED4 date_added; UNSIGNED4 date_complaint; UNSIGNED7 veh_miles; UNSIGNED4 num_occurrences; STRING2048 complaint_desc; END; layout_base_NHTSA_complaints xfm_to_base_file(layout_in_NHTSA_complaints le) := TRANSFORM SELF.complaint_id := le.CMPLID; SELF.case_ref_num := le.ODINO; SELF.mfr_name := le.MFR_NAME; SELF.make := le.MAKETXT; SELF.model := le.MODELTXT; SELF.model_year := le.YEARTXT; SELF.in_crash := le.CRASH; SELF.date_incident := (UNSIGNED4)le.FAILDATE; SELF.in_fire := le.FIRE; SELF.num_injured := le.INJURED; SELF.num_deaths := le.DEATHS; SELF.component_desc := le.COMPDESC; SELF.consumer_city := le.CITY; SELF.consumer_state_cd := le.STATE; SELF.vin := le.VIN; SELF.date_added := (UNSIGNED4)le.DATEA; SELF.date_complaint := (UNSIGNED4)le.LDATE; SELF.veh_miles := le.MILES; SELF.num_occurrences := le.OCCURENCES; SELF.complaint_desc := le.CDESCR; END; ds_NHTSA_complaints_slim := PROJECT( ds_NHTSA_complaints, xfm_to_base_file(LEFT) ) : INDEPENDENT; // For one of the queries below, remove 'mostly' duplicate records; each // complaint_id is unique; but, each case_ref_num occurs 1-M times. Usually, // many occurrences of a case_ref_num reveals different components failing // in the same incident. So, the records in this case are identical except // for the component_desc values. complaints_dist := DISTRIBUTE( ds_NHTSA_complaints_slim, HASH32(case_ref_num) ); ds_NHTSA_complaints_ddpd := DEDUP( SORT( complaints_dist, case_ref_num, make, LOCAL ), case_ref_num, make, LOCAL ); /* ********************************************* Using the data: ********************************************* */ // The NHTSA Complaints data is actually pretty interesting, and I have provided // three queries, below. // *** Common Attributes *** // The following macro allows for some very basic, literal string searching // on any field in the NHTSA base file. You can change the dataset, the field // to search against, and/or the search terms. mac_search_file( cmplnts_file, fieldnm, searchexp, results ) := MACRO #UNIQUENAME(searchexp_uppercase) #UNIQUENAME(complaints_file) #UNIQUENAME(searchexpression) %searchexp_uppercase% := UCase(searchexp); %complaints_file% := cmplnts_file; PATTERN %searchexpression% := [ %searchexp_uppercase% ]; results := PARSE( %complaints_file%, fieldnm, %searchexpression%, TRANSFORM(LEFT), NOSCAN, NOCASE ); ENDMACRO; // *** Query1: *** // // Output all records whose complaint_desc field contains the expression 'daughter'. // Use a CHOOSEN to limit the number of results, as they exceed the maximum amount // of memory allowed. mac_search_file(ds_NHTSA_complaints_ddpd, complaint_desc, 'daughter', complaint_search_results) OUTPUT( CHOOSEN(complaint_search_results, 100), NAMED('complaint_search_results') ); // *** Query 2: *** // // The following query shows how many A/C problems were reported throughout year // historically to the NHTSA. We can see some different numbers according to what // time of year it is, which is what we would expect. // First obtain a dataset of all records having the search expression. mac_search_file(ds_NHTSA_complaints_slim, component_desc, 'air cond', component_search_result); // Add a separate field and parse the complaint date to get the month. results_with_month_reported := PROJECT( component_search_result(date_complaint != 0), TRANSFORM( {layout_base_NHTSA_complaints, STRING2 month}, SELF.month := ( (STRING8)LEFT.date_complaint )[5..6], SELF := LEFT ) ); // Group by month and count each group. tbl_num_complaints_by_month := TABLE( results_with_month_reported, { month, cnt := COUNT(GROUP) }, month ); // Sort and output. tbl_num_complaints_sorted := SORT( tbl_num_complaints_by_month, (UNSIGNED)month ); OUTPUT( tbl_num_complaints_sorted, NAMED('num_complaints_by_month') ); // *** Query 3: *** // // The following query displays the top 5 automobile components having problems // as reported by consumers for a particular make, model, start model year, and // end model year. This information could be helpful when shopping for a particular // automobile. car_manufacturers := DATASET( [ {'ABARTH'} , {'ACURA'} , {'AMC'} , {'ALFA ROMEO'}, {'ASTON MARTIN'}, {'AUDI'} , {'AUSTIN'} , {'AUTHI'} , {'BARREIROS'} , {'BAOLONG'}, {'BENTLEY'} , {'BISCUTER'} , {'BMC'} , {'BMW'} , {'BUGATTI'}, {'BUICK'} , {'CADILLAC'} , {'CHANA'} , {'CHANGCHENG'}, {'CHANGFENG'}, {'CHERY'} , {'CHEVROLET'}, {'CHRYSLER'}, {'DAIMLER AG'}, {'DAEWOO'}, {'DAIHATSU'}, {'DAIMLER'} , {'CITROEN'} , {'DAIMLER-AG'}, {'DELOREAN'}, {'DMC'} , {'DODGE'} , {'DONGFENG'}, {'EAGLE'} , {'ELIZALDE'}, {'ENASA'} , {'EUCORT'} , {'FERRARI'} , {'FIAT'} , {'FISKER'}, {'FORD'} , {'GM'} , {'GMC'} , {'GEO'} , {'HAIMA'}, {'HOLDEN'} , {'HONDA'} , {'HONGQI'} , {'HUALI'} , {'HUMMER'}, {'HYUNDAI'} , {'INFINITI'} , {'ISUZU'} , {'LAND ROVER'}, {'JEEP'}, {'KIA'} , {'LANCIA'} , {'JAGUAR'} , {'LANDROVER'} , {'LAMBORGHINI'}, {'LEXUS'} , {'LINCOLN'} , {'LOTUS'} , {'MAHINDRA'} , {'MARCOS'}, {'MASERATI'}, {'MAYBACH'} , {'MAZDA'} , {'MCLAREN'} , {'MERCEDES BENZ'}, {'MERCURY'} , {'MG'} , {'MINI'} , {'MITSUBISHI'}, {'MERCEDES-BENZ'}, {'MITSUOKA'}, {'NAVISTAR'} , {'NISSAN'} , {'OLDSMOBILE'}, {'OPEL'}, {'PAGANI'} , {'PANOZ'} , {'PROTON'} , {'PACCAR DAF'}, {'PININFARINA'}, {'PEUGEOT'} , {'PLYMOUTH'} , {'PONTIAC'} , {'PORSCHE'} , {'ROLLS ROYCE'}, {'QIRUI'} , {'RAM'} , {'RENAULT'} , {'PACCAR-DAF'}, {'ROLLS-ROYCE'}, {'ROVER'} , {'SAAB'} , {'SATURN'} , {'SCION'} , {'SEAT'}, {'SKODA'} , {'SMART'} , {'SPYKER'} , {'SSANGYONG'} , {'STERLING'}, {'SUBARU'} , {'SUZUKI'} , {'TATA'} , {'TESLA'} , {'TOYOTA'}, {'TRIUMPH'} , {'VAUXHALL'} , {'VOLVO'} , {'WULING'} , {'VOLKSWAGEN'}, {'YUGO'} ], {STRING18 mfr} ); // We want car manufacturers only, so join to the lookup table defined above. complaints_cars_only := JOIN( ds_NHTSA_complaints_slim, car_manufacturers, LEFT.make = RIGHT.mfr, TRANSFORM(LEFT), INNER, MANY LOOKUP ); // The following function filters the car-complaints dataset by make, model, as well // as start and end model years. Note the filter tries to account for one or more // missing parameters and still return a result that is useful. fn_list_top_complaints( STRING25 s_make = '', STRING256 s_model = '', STRING4 mdl_yr_start = '', STRING4 mdl_yr_end = '' ) := FUNCTION complaints_filtered := complaints_cars_only( (make = UCase(s_make) OR s_make = '') AND (model = UCase(s_model) OR s_model = '') AND ((UNSIGNED2)model_year >= (UNSIGNED2)mdl_yr_start OR mdl_yr_start = '') AND ((UNSIGNED2)model_year <= (UNSIGNED2)mdl_yr_end OR mdl_yr_end = '') ); tbl_complaints_by_component := TABLE( complaints_filtered, { make, model, component_desc, cnt := COUNT(GROUP) }, make, model, component_desc ); tbl_complaints_top5 := TOPN( UNGROUP(tbl_complaints_by_component), 5, make, model, -cnt ); RETURN tbl_complaints_top5; END; top5_complaints_by_component := fn_list_top_complaints('FORD','EXPLORER','1995','2007'); OUTPUT( top5_complaints_by_component, NAMED('top5_complaints_by_component'), ALL ); /* END */