/* *************************************************************** Kenneth J. Hill kenneth.hill@lexisnexis.com The Heritage Provider Network is sponsoring a contest whose goal is to develop a predictive algorithm that can identify patients who will be admitted to the hospital within the next year, using historical claims data. This contest is open until April of 2013. Full information regarding the contest is available at: http://www.heritagehealthprize.com By registering to compete in the contest, you will be given access to the contest data which consists of claims data, training data, and member information. The file is supplied as a single zipped file of about 49MB in size. The unzipped contents of the file are: Claims.csv DaysInHospital_Y2.csv DaysInHospital_Y3.csv Lookup PrimaryConditionGroup.csv Lookup ProcedureGroup.csv Members.csv Target.csv This example walks through the process of joining some of the files together, collecting some statistics about the files, and finally preparing a file that can be submitted to the contest for scoring. The first step will be to load the files into the HPCC. To do this you will need to copy the following files to your landing zone: Claims.csv DaysInHospital_Y2.csv DaysInHospital_Y3.csv Members.csv Target.csv Once you have the files in your landing zone, you will need to spray them to the HPCC using the Ecl Watch/Spray CSV function. This example uses the following names when importing the data: '~thor::in::hhcp::members.csv' '~thor::in::hhcp::claims.csv' '~thor::in::hhcp::daysinhospital_y2.csv' '~thor::in::hhcp::daysinhospital_y3.csv' '~thor::in::hhcp::target.csv' Once you have the data imported, we can begin the processing. *************************************************************** */ // --------------------------------------------------------------- // We will begin by creating definitions of our input files. // The members file contains information about each person in the data sets. 
// The files each contain a header line, so we use the CSV(HEADING(1))
// option to skip the first line of each dataset.

// Layout of the members file: one row per person in the data sets.
member_record := RECORD
  STRING8 MemberID;
  STRING8 Age;
  STRING1 Sex;
END;

file_members := DATASET('~thor::in::hhcp::members.csv',
                        member_record,
                        CSV(HEADING(1)));

// Layout of the claims file: every claim filed by individuals in the
// members file for years one, two, and three. This file contains about
// 2.5 million records and is about 250MB in size.
claims_record := RECORD
  STRING8  MemberID;
  STRING8  ProviderID;
  STRING8  Vendor;
  STRING8  PCP;
  STRING2  Year;
  STRING32 Specialty;
  STRING32 PlaceSvc;
  STRING12 PayDelay;
  STRING8  LengthOfStay;
  STRING8  DSFS;
  STRING8  PrimaryConditionGroup;
  STRING8  CharlsonIndex;
  STRING8  ProcedureGroup;
  STRING8  SupLOS;
END;

file_claims := DATASET('~thor::in::hhcp::claims.csv',
                       claims_record,
                       CSV(HEADING(1)));

// Layout shared by both days-in-hospital files: how many days the member
// spent in the hospital in the corresponding reporting period. This
// information can be used for training a prediction algorithm.
dih_record := RECORD
  STRING8  MemberID;
  integer1 ClaimsTruncated;
  integer1 DaysInHospital;
END;

file_y2DIH := DATASET('~thor::in::hhcp::daysinhospital_y2.csv',
                      dih_record,
                      CSV(HEADING(1)));

file_y3DIH := DATASET('~thor::in::hhcp::daysinhospital_y3.csv',
                      dih_record,
                      CSV(HEADING(1)));

// Now the work begins! Our first goal is to merge some interesting
// information together for each member: the member_record plus a count of
// all claims, a subcount of claims for each year in the file, and the
// days-in-hospital values. This record serves as the input for our
// modeling algorithm.
// The next definition uses RECORD's ability to inherit from another
// record via the 'RECORD(member_record)' syntax.
// Here we build a new dataset called extendedmembers which consists of the
// base member_record plus some summary information and the training values.
// NOTE(review): the claim counters were previously integer1 (signed, one
// byte, max 127). SELF.ClaimCount is assigned from COUNTER, so any member
// with more than 127 claims would silently overflow; the counters are
// widened to integer4. DaysInHospital values are small (capped in the
// contest data), so integer1 remains adequate for those fields.
extended_member_record := RECORD(member_record)
  integer4 ClaimCount;        // total claims for the member, all years
  integer4 y1Claims;          // claims filed in year one
  integer4 y2Claims;          // claims filed in year two
  integer4 y3Claims;          // claims filed in year three
  integer1 y2DaysInHospital;  // training value for year two
  integer1 y3DaysInHospital;  // training value for year three
END;

// Seed the extended records from the members file; the SELF := [] clause
// clears every field not copied from LEFT, so all counters start at zero.
extendedmembers := PROJECT(file_members,
                           TRANSFORM(extended_member_record,
                                     SELF := LEFT;
                                     SELF := [];));

// Since 'members.csv' to 'claims.csv' is a one-to-many relationship, we
// denormalize the data to generate a summary of all claims information for
// each member: a count of claims for each year as well as a count of all
// claims present for the member.
extended_member_record d(extended_member_record le,
                         claims_record ri,
                         integer C) := TRANSFORM
  // COUNTER is 1 for the first matching claim and increments per match,
  // so the last value written is the member's total claim count.
  self.ClaimCount := C;
  self.y1Claims := IF(ri.Year = 'Y1', le.y1Claims + 1, le.y1Claims);
  self.y2Claims := IF(ri.Year = 'Y2', le.y2Claims + 1, le.y2Claims);
  self.y3Claims := IF(ri.Year = 'Y3', le.y3Claims + 1, le.y3Claims);
  self := le;
END;

// NOTE(review): DENORMALIZE with no join flag behaves as an inner join, so
// members with no claims at all are dropped here — confirm this is
// intended, since the sanity check later expects count(extended_members)
// to equal count(file_members).
denormed_claims := DENORMALIZE(extendedmembers, file_claims,
                               LEFT.MemberID = RIGHT.MemberID,
                               d(LEFT, RIGHT, COUNTER));

// The next two transforms capture the days-in-hospital information for
// years two and three. We use these to produce the final extended_members
// dataset containing all of the information we are interested in.
// A LEFT OUTER join is used because we want one output record for every
// record in the left dataset.
// Copy the year-two days-in-hospital value onto the extended record.
// For non-matching members (LEFT OUTER), RIGHT's fields default to zero.
extended_member_record dih2(extended_member_record le, dih_record ri) := TRANSFORM
  self.y2DaysInHospital := ri.DaysInHospital;
  self := le;
END;

// Same as dih2, but for the year-three value.
extended_member_record dih3(extended_member_record le, dih_record ri) := TRANSFORM
  self.y3DaysInHospital := ri.DaysInHospital;
  self := le;
END;

dih := JOIN(denormed_claims, file_y2DIH,
            LEFT.MemberID = RIGHT.MemberID,
            dih2(LEFT, RIGHT),
            LEFT OUTER);

extended_members := JOIN(dih, file_y3DIH,
                         LEFT.MemberID = RIGHT.MemberID,
                         dih3(LEFT, RIGHT),
                         LEFT OUTER);

// ----------------------------------------------------------
// Now that we have the extended_members dataset defined we can use it to
// generate some information. First we define a record structure that can
// be used as an output format, allowing us to present the information in
// a readable manner.
out_record := RECORD
  integer4 total_members;
  integer4 total_extended_members;
  integer4 males;
  integer4 females;
  integer4 total_claimcount;
  integer4 claims_y1;
  integer4 claims_y2;
  integer4 claims_y3;
  integer4 filelen_daysinhospital_y2;
  integer4 filelen_daysinhospital_y3;
END;

// Now we populate an out_record with the info we are interested in.
// (The trailing comma that previously followed the last value in the row
// constructor was removed — it is a syntax error in an inline DATASET.)
out := DATASET([{
  // count of file_members must equal count of extended_members --
  // otherwise there is a bug in our code!
  count(file_members),
  count(extended_members),
  count(extended_members(Sex = 'M')),
  count(extended_members(Sex = 'F')),
  count(file_claims),
  count(file_claims(Year = 'Y1')),
  count(file_claims(Year = 'Y2')),
  count(file_claims(Year = 'Y3')),
  count(file_y2DIH),
  count(file_y3DIH)
}], out_record);

// And here is the actual output
output(out);

// ----------------------------------------------------------
// The next section sets days in hospital for the target members and
// writes an output file that is ready for submission to the contest.
// We need to predict year-four days in hospital for members in the
// target.csv file.
// target.csv has the same format as the other DaysInHospital files, so we
// reuse the dih_record definition.
file_target := DATASET('~thor::in::hhcp::target.csv',
                       dih_record,
                       CSV(HEADING(1)));

// One contest requirement is that the submission file contains the members
// in the same order as the target file, so we add a sequence number that
// can be used to restore that order when we are done.
dih_seq_record := RECORD(dih_record)
  integer seq;
END;

sequenced_target := PROJECT(file_target,
                            TRANSFORM(dih_seq_record,
                                      SELF.seq := COUNTER;
                                      self := LEFT;));

// The score transform calculates days in hospital when the target dataset
// is joined with the extended_members dataset derived above. This example
// uses a very simple algorithm: predict year-four days in hospital as the
// average of years two and three IF year-three claims are greater than or
// equal to year-two claims, otherwise predict 0.
dih_seq_record score(dih_seq_record le, extended_member_record ri) := TRANSFORM
  self.DaysInHospital := IF(ri.y3Claims >= ri.y2Claims,
                            (ri.y2DaysInHospital + ri.y3DaysInHospital) / 2,
                            0);
  self := le;
END;

// Bring it all together and apply the transform to all target records.
// A LEFT OUTER join yields exactly one output row for each target row;
// members absent from extended_members score 0 via RIGHT's default fields.
targetout := JOIN(sequenced_target, extended_members,
                  LEFT.MemberID = RIGHT.MemberID,
                  score(LEFT, RIGHT),
                  LEFT OUTER);

// The final result is sorted on sequence number to preserve the target
// order and written out in the correct format to a file called results.csv.
// NOTE(review): the logical filename previously used the scope 'hhpc',
// inconsistent with the 'hhcp' scope used by every input file in this
// job; corrected here to 'hhcp'.
output(SORT(targetout, seq),
       {MemberID, ClaimsTruncated, DaysInHospital},
       '~thor::out::hhcp::results.csv',
       CSV(HEADING(SINGLE)),
       OVERWRITE);

/* ***************************************************************
The results.csv file can be copied out of the HPCC by using the
'despray' option on the ECL Watch / Browse files page.
Once the file is desprayed it can be copied out of your landing zone and uploaded to the Heritage Health Prize submission page. *************************************************************** */