# make sure the package is installed --> devtools::install_github("hestiri/mlho")
library(mlho)

Install and load the required packages:

if(!require(pacman)) install.packages("pacman")
#> Loading required package: pacman

pacman::p_load(data.table, devtools, backports, Hmisc, tidyr,dplyr,ggplot2,plyr,scales,readr,
               httr, DT, lubridate, tidyverse,reshape2,foreach,doParallel,caret,gbm,lubridate,praznik)

MLHO Vanilla implementation

Here we go through the vanilla implementation of MLHO using the synthetic data provided in the package.

The synthetic data syntheticmass downloaded and prepared according to MLHO input data model from SyntheticMass, generated by SyntheaTM, an open-source patient population simulation made available by The MITRE Corporation.

data("syntheticmass")

here’s how dbmart table looks (there’s an extra column that is the translation of the phenx concepts):

head(dbmart)
#>                            patient_num     phenx
#> 1 8d4c4326-e9de-4f45-9a4c-f8c36bff89ae 169553002
#> 2 10339b10-3cd1-4ac3-ac13-ec26728cb592 430193006
#> 3 f5dcd418-09fe-4a2f-baa0-3da800bd8c3a 430193006
#> 4 f5dcd418-09fe-4a2f-baa0-3da800bd8c3a 117015009
#> 5 f5dcd418-09fe-4a2f-baa0-3da800bd8c3a 117015009
#> 6 f5dcd418-09fe-4a2f-baa0-3da800bd8c3a 430193006
#>                               DESCRIPTION          start_date
#> 1 Insertion of subcutaneous contraceptive 2011-04-30 00:26:23
#> 2   Medication Reconciliation (procedure) 2010-07-27 12:58:08
#> 3   Medication Reconciliation (procedure) 2010-11-20 03:04:34
#> 4              Throat culture (procedure) 2011-02-07 03:04:34
#> 5              Throat culture (procedure) 2011-04-19 03:04:34
#> 6   Medication Reconciliation (procedure) 2011-11-26 03:04:34

our demographic table contains: patient_num, white, black, hispanic, sex_cd.

##split data into train-test

uniqpats <- c(as.character(unique(dbmart$patient_num)))
#using a 70-30 ratio
test_ind <- sample(uniqpats,
                   round(.3*length(uniqpats)))

test_labels <- subset(labeldt,labeldt$patient_num %in% c(test_ind))
print("test set lables:")
#> [1] "test set lables:"
table(test_labels$label)
#> 
#>   0   1 
#> 261  90
train_labels <- subset(labeldt,!(labeldt$patient_num %in% c(test_ind)))
print("train set lables:")
#> [1] "train set lables:"
table(train_labels$label)
#> 
#>   0   1 
#> 593 227
# train and test sets 
dat.train  <- subset(dbmart,!(dbmart$patient_num %in% c(test_ind)))
dat.test <- subset(dbmart,dbmart$patient_num %in% c(test_ind))

now dimensionality reduction on training set ## MSMR lite from here, we will split the data into a train and a test set and apply MSMR.lite to the training data

data.table::setDT(dat.train)
dat.train[,row := .I]
dat.train$value.var <- 1
uniqpats.train <- c(as.character(unique(dat.train$patient_num)))

##here is the application of MSMR.lite
dat.train <- MSMSR.lite(MLHO.dat=dat.train,
                        patients = uniqpats.train,
                        sparsity=0.005, 
                        labels = labeldt,
                        topn=200)
#> [1] "step - 1: sparsity screening!"
#> [1] "step 2: JMI dimensionality reduction!"

Notice that we are removing concepts that had prevalence less than 0.5% and then only taking the top 200 after the JMI rankings. See help (?mlho::MSMSR.lite) for MSMR.lite parameters.

Now we have the training data with the top 200 features, to which we can add demographic features. Now on to prepping the test set:

dat.test <- subset(dat.test,dat.test$phenx %in% colnames(dat.train))
setDT(dat.test)
dat.test[,row := .I]
dat.test$value.var <- 1
uniqpats.test <- c(as.character(unique(dat.test$patient_num)))

dat.test <- MSMSR.lite(MLHO.dat=dat.test,patients = uniqpats.test,sparsity=NA,jmi = FALSE,labels = labeldt)

Here notice that we only used MSMR.lite to generate the wide table that matches to the dat.train table.

##Modeling we will use the mlearn function to do the modeling, which includes training the model and testing it on the test set.

model.test <- mlearn(dat.train,
                   dat.test,
                   dems=dems,
                   save.model=FALSE,
                   classifier="gbm",
                   note="mlho_terst_run",
                   cv="cv",
                   nfold=5,
                   aoi="prediabetes",
                   multicore=FALSE)
#> [1] "the modeling!"
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0721             nan     0.1000    0.0528
#>      2        0.9872             nan     0.1000    0.0390
#>      3        0.9256             nan     0.1000    0.0316
#>      4        0.8717             nan     0.1000    0.0266
#>      5        0.8291             nan     0.1000    0.0205
#>      6        0.7951             nan     0.1000    0.0169
#>      7        0.7667             nan     0.1000    0.0140
#>      8        0.7457             nan     0.1000    0.0121
#>      9        0.7256             nan     0.1000    0.0100
#>     10        0.7085             nan     0.1000    0.0085
#>     20        0.6163             nan     0.1000    0.0024
#>     40        0.5363             nan     0.1000    0.0005
#>     60        0.4973             nan     0.1000    0.0003
#>     80        0.4702             nan     0.1000   -0.0000
#>    100        0.4476             nan     0.1000   -0.0013
#>    120        0.4262             nan     0.1000   -0.0008
#>    140        0.4135             nan     0.1000   -0.0011
#>    150        0.4063             nan     0.1000   -0.0004
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0582             nan     0.1000    0.0554
#>      2        0.9733             nan     0.1000    0.0430
#>      3        0.9079             nan     0.1000    0.0332
#>      4        0.8545             nan     0.1000    0.0272
#>      5        0.8045             nan     0.1000    0.0230
#>      6        0.7650             nan     0.1000    0.0183
#>      7        0.7382             nan     0.1000    0.0129
#>      8        0.7121             nan     0.1000    0.0124
#>      9        0.6852             nan     0.1000    0.0113
#>     10        0.6631             nan     0.1000    0.0099
#>     20        0.5478             nan     0.1000    0.0025
#>     40        0.4538             nan     0.1000    0.0009
#>     60        0.4097             nan     0.1000   -0.0009
#>     80        0.3759             nan     0.1000   -0.0003
#>    100        0.3531             nan     0.1000   -0.0010
#>    120        0.3317             nan     0.1000   -0.0000
#>    140        0.3187             nan     0.1000   -0.0002
#>    150        0.3119             nan     0.1000   -0.0018
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0704             nan     0.1000    0.0601
#>      2        0.9735             nan     0.1000    0.0469
#>      3        0.9016             nan     0.1000    0.0342
#>      4        0.8454             nan     0.1000    0.0297
#>      5        0.7962             nan     0.1000    0.0226
#>      6        0.7509             nan     0.1000    0.0196
#>      7        0.7127             nan     0.1000    0.0163
#>      8        0.6819             nan     0.1000    0.0150
#>      9        0.6553             nan     0.1000    0.0111
#>     10        0.6284             nan     0.1000    0.0106
#>     20        0.5026             nan     0.1000    0.0021
#>     40        0.4142             nan     0.1000   -0.0004
#>     60        0.3641             nan     0.1000   -0.0001
#>     80        0.3306             nan     0.1000   -0.0014
#>    100        0.3033             nan     0.1000   -0.0012
#>    120        0.2817             nan     0.1000   -0.0011
#>    140        0.2661             nan     0.1000   -0.0007
#>    150        0.2562             nan     0.1000   -0.0007
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0637             nan     0.1000    0.0571
#>      2        0.9826             nan     0.1000    0.0414
#>      3        0.9181             nan     0.1000    0.0333
#>      4        0.8659             nan     0.1000    0.0269
#>      5        0.8190             nan     0.1000    0.0227
#>      6        0.7827             nan     0.1000    0.0185
#>      7        0.7500             nan     0.1000    0.0149
#>      8        0.7284             nan     0.1000    0.0132
#>      9        0.7068             nan     0.1000    0.0106
#>     10        0.6884             nan     0.1000    0.0091
#>     20        0.6033             nan     0.1000    0.0000
#>     40        0.5316             nan     0.1000   -0.0006
#>     60        0.4914             nan     0.1000    0.0001
#>     80        0.4715             nan     0.1000   -0.0004
#>    100        0.4504             nan     0.1000   -0.0004
#>    120        0.4302             nan     0.1000   -0.0010
#>    140        0.4173             nan     0.1000    0.0004
#>    150        0.4100             nan     0.1000   -0.0005
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0663             nan     0.1000    0.0589
#>      2        0.9718             nan     0.1000    0.0438
#>      3        0.8973             nan     0.1000    0.0342
#>      4        0.8388             nan     0.1000    0.0288
#>      5        0.7906             nan     0.1000    0.0229
#>      6        0.7500             nan     0.1000    0.0187
#>      7        0.7174             nan     0.1000    0.0166
#>      8        0.6879             nan     0.1000    0.0130
#>      9        0.6642             nan     0.1000    0.0093
#>     10        0.6449             nan     0.1000    0.0089
#>     20        0.5255             nan     0.1000    0.0016
#>     40        0.4490             nan     0.1000    0.0005
#>     60        0.4075             nan     0.1000   -0.0018
#>     80        0.3806             nan     0.1000   -0.0011
#>    100        0.3605             nan     0.1000   -0.0012
#>    120        0.3400             nan     0.1000    0.0001
#>    140        0.3246             nan     0.1000   -0.0010
#>    150        0.3184             nan     0.1000   -0.0006
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0537             nan     0.1000    0.0615
#>      2        0.9639             nan     0.1000    0.0448
#>      3        0.8878             nan     0.1000    0.0345
#>      4        0.8277             nan     0.1000    0.0284
#>      5        0.7796             nan     0.1000    0.0246
#>      6        0.7364             nan     0.1000    0.0175
#>      7        0.6977             nan     0.1000    0.0160
#>      8        0.6659             nan     0.1000    0.0138
#>      9        0.6410             nan     0.1000    0.0107
#>     10        0.6206             nan     0.1000    0.0094
#>     20        0.4928             nan     0.1000    0.0022
#>     40        0.4071             nan     0.1000   -0.0004
#>     60        0.3632             nan     0.1000   -0.0015
#>     80        0.3319             nan     0.1000   -0.0012
#>    100        0.3091             nan     0.1000   -0.0004
#>    120        0.2881             nan     0.1000   -0.0018
#>    140        0.2698             nan     0.1000   -0.0006
#>    150        0.2631             nan     0.1000   -0.0011
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0661             nan     0.1000    0.0587
#>      2        0.9818             nan     0.1000    0.0440
#>      3        0.9120             nan     0.1000    0.0333
#>      4        0.8590             nan     0.1000    0.0283
#>      5        0.8127             nan     0.1000    0.0226
#>      6        0.7692             nan     0.1000    0.0181
#>      7        0.7391             nan     0.1000    0.0159
#>      8        0.7148             nan     0.1000    0.0132
#>      9        0.6930             nan     0.1000    0.0115
#>     10        0.6754             nan     0.1000    0.0097
#>     20        0.5846             nan     0.1000    0.0010
#>     40        0.5066             nan     0.1000    0.0012
#>     60        0.4652             nan     0.1000   -0.0002
#>     80        0.4353             nan     0.1000    0.0002
#>    100        0.4179             nan     0.1000    0.0002
#>    120        0.4029             nan     0.1000   -0.0003
#>    140        0.3899             nan     0.1000   -0.0020
#>    150        0.3820             nan     0.1000   -0.0005
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0657             nan     0.1000    0.0618
#>      2        0.9775             nan     0.1000    0.0436
#>      3        0.9004             nan     0.1000    0.0362
#>      4        0.8380             nan     0.1000    0.0272
#>      5        0.7841             nan     0.1000    0.0232
#>      6        0.7429             nan     0.1000    0.0193
#>      7        0.7055             nan     0.1000    0.0175
#>      8        0.6779             nan     0.1000    0.0139
#>      9        0.6548             nan     0.1000    0.0111
#>     10        0.6321             nan     0.1000    0.0103
#>     20        0.5101             nan     0.1000    0.0031
#>     40        0.4295             nan     0.1000   -0.0018
#>     60        0.3870             nan     0.1000   -0.0016
#>     80        0.3540             nan     0.1000   -0.0010
#>    100        0.3345             nan     0.1000   -0.0014
#>    120        0.3137             nan     0.1000   -0.0008
#>    140        0.2966             nan     0.1000   -0.0008
#>    150        0.2925             nan     0.1000   -0.0017
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0526             nan     0.1000    0.0627
#>      2        0.9570             nan     0.1000    0.0461
#>      3        0.8797             nan     0.1000    0.0358
#>      4        0.8154             nan     0.1000    0.0290
#>      5        0.7688             nan     0.1000    0.0229
#>      6        0.7236             nan     0.1000    0.0203
#>      7        0.6853             nan     0.1000    0.0170
#>      8        0.6547             nan     0.1000    0.0141
#>      9        0.6281             nan     0.1000    0.0108
#>     10        0.6053             nan     0.1000    0.0112
#>     20        0.4740             nan     0.1000    0.0025
#>     40        0.3882             nan     0.1000   -0.0010
#>     60        0.3390             nan     0.1000   -0.0009
#>     80        0.3045             nan     0.1000   -0.0032
#>    100        0.2772             nan     0.1000   -0.0015
#>    120        0.2573             nan     0.1000   -0.0006
#>    140        0.2402             nan     0.1000   -0.0009
#>    150        0.2326             nan     0.1000   -0.0011
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0700             nan     0.1000    0.0551
#>      2        0.9756             nan     0.1000    0.0452
#>      3        0.9095             nan     0.1000    0.0353
#>      4        0.8497             nan     0.1000    0.0264
#>      5        0.8068             nan     0.1000    0.0220
#>      6        0.7691             nan     0.1000    0.0191
#>      7        0.7380             nan     0.1000    0.0159
#>      8        0.7091             nan     0.1000    0.0134
#>      9        0.6861             nan     0.1000    0.0107
#>     10        0.6662             nan     0.1000    0.0090
#>     20        0.5802             nan     0.1000    0.0028
#>     40        0.5065             nan     0.1000   -0.0014
#>     60        0.4675             nan     0.1000   -0.0002
#>     80        0.4420             nan     0.1000   -0.0013
#>    100        0.4206             nan     0.1000   -0.0009
#>    120        0.4011             nan     0.1000    0.0000
#>    140        0.3862             nan     0.1000   -0.0003
#>    150        0.3812             nan     0.1000   -0.0005
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0538             nan     0.1000    0.0572
#>      2        0.9610             nan     0.1000    0.0461
#>      3        0.8893             nan     0.1000    0.0353
#>      4        0.8345             nan     0.1000    0.0286
#>      5        0.7801             nan     0.1000    0.0264
#>      6        0.7425             nan     0.1000    0.0198
#>      7        0.7053             nan     0.1000    0.0184
#>      8        0.6706             nan     0.1000    0.0153
#>      9        0.6487             nan     0.1000    0.0112
#>     10        0.6257             nan     0.1000    0.0108
#>     20        0.5125             nan     0.1000    0.0006
#>     40        0.4355             nan     0.1000   -0.0024
#>     60        0.3941             nan     0.1000   -0.0019
#>     80        0.3611             nan     0.1000   -0.0014
#>    100        0.3349             nan     0.1000   -0.0016
#>    120        0.3154             nan     0.1000   -0.0013
#>    140        0.3010             nan     0.1000   -0.0014
#>    150        0.2949             nan     0.1000   -0.0016
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0573             nan     0.1000    0.0621
#>      2        0.9624             nan     0.1000    0.0463
#>      3        0.8811             nan     0.1000    0.0383
#>      4        0.8171             nan     0.1000    0.0292
#>      5        0.7685             nan     0.1000    0.0231
#>      6        0.7211             nan     0.1000    0.0225
#>      7        0.6805             nan     0.1000    0.0178
#>      8        0.6491             nan     0.1000    0.0132
#>      9        0.6232             nan     0.1000    0.0113
#>     10        0.5994             nan     0.1000    0.0100
#>     20        0.4711             nan     0.1000    0.0016
#>     40        0.3922             nan     0.1000   -0.0015
#>     60        0.3461             nan     0.1000   -0.0017
#>     80        0.3091             nan     0.1000   -0.0003
#>    100        0.2822             nan     0.1000   -0.0018
#>    120        0.2608             nan     0.1000   -0.0008
#>    140        0.2445             nan     0.1000   -0.0005
#>    150        0.2362             nan     0.1000   -0.0016
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0619             nan     0.1000    0.0582
#>      2        0.9711             nan     0.1000    0.0456
#>      3        0.8971             nan     0.1000    0.0364
#>      4        0.8401             nan     0.1000    0.0275
#>      5        0.7954             nan     0.1000    0.0231
#>      6        0.7565             nan     0.1000    0.0182
#>      7        0.7253             nan     0.1000    0.0158
#>      8        0.7033             nan     0.1000    0.0126
#>      9        0.6768             nan     0.1000    0.0104
#>     10        0.6579             nan     0.1000    0.0091
#>     20        0.5657             nan     0.1000    0.0012
#>     40        0.4950             nan     0.1000    0.0005
#>     60        0.4596             nan     0.1000   -0.0005
#>     80        0.4294             nan     0.1000    0.0001
#>    100        0.4054             nan     0.1000    0.0001
#>    120        0.3888             nan     0.1000   -0.0009
#>    140        0.3740             nan     0.1000   -0.0002
#>    150        0.3658             nan     0.1000    0.0001
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0620             nan     0.1000    0.0646
#>      2        0.9669             nan     0.1000    0.0490
#>      3        0.8863             nan     0.1000    0.0368
#>      4        0.8233             nan     0.1000    0.0290
#>      5        0.7734             nan     0.1000    0.0228
#>      6        0.7339             nan     0.1000    0.0191
#>      7        0.6984             nan     0.1000    0.0174
#>      8        0.6695             nan     0.1000    0.0145
#>      9        0.6451             nan     0.1000    0.0116
#>     10        0.6222             nan     0.1000    0.0110
#>     20        0.5025             nan     0.1000    0.0018
#>     40        0.4242             nan     0.1000   -0.0015
#>     60        0.3766             nan     0.1000   -0.0007
#>     80        0.3408             nan     0.1000   -0.0019
#>    100        0.3161             nan     0.1000   -0.0005
#>    120        0.2965             nan     0.1000   -0.0015
#>    140        0.2808             nan     0.1000   -0.0009
#>    150        0.2742             nan     0.1000   -0.0008
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0535             nan     0.1000    0.0649
#>      2        0.9562             nan     0.1000    0.0474
#>      3        0.8786             nan     0.1000    0.0371
#>      4        0.8181             nan     0.1000    0.0291
#>      5        0.7646             nan     0.1000    0.0239
#>      6        0.7176             nan     0.1000    0.0214
#>      7        0.6807             nan     0.1000    0.0160
#>      8        0.6481             nan     0.1000    0.0136
#>      9        0.6180             nan     0.1000    0.0126
#>     10        0.5950             nan     0.1000    0.0095
#>     20        0.4681             nan     0.1000    0.0015
#>     40        0.3774             nan     0.1000    0.0001
#>     60        0.3295             nan     0.1000   -0.0014
#>     80        0.2936             nan     0.1000   -0.0009
#>    100        0.2704             nan     0.1000   -0.0028
#>    120        0.2482             nan     0.1000   -0.0009
#>    140        0.2311             nan     0.1000   -0.0002
#>    150        0.2233             nan     0.1000   -0.0016
#> 
#> Iter   TrainDeviance   ValidDeviance   StepSize   Improve
#>      1        1.0565             nan     0.1000    0.0553
#>      2        0.9672             nan     0.1000    0.0465
#>      3        0.8913             nan     0.1000    0.0368
#>      4        0.8304             nan     0.1000    0.0276
#>      5        0.7795             nan     0.1000    0.0242
#>      6        0.7363             nan     0.1000    0.0211
#>      7        0.6973             nan     0.1000    0.0171
#>      8        0.6692             nan     0.1000    0.0123
#>      9        0.6408             nan     0.1000    0.0126
#>     10        0.6183             nan     0.1000    0.0101
#>     20        0.4880             nan     0.1000    0.0035
#>     40        0.4018             nan     0.1000   -0.0001
#>     50        0.3766             nan     0.1000   -0.0005
#> Loading required package: pROC
#> Type 'citation("pROC")' for a citation.
#> 
#> Attaching package: 'pROC'
#> The following objects are masked from 'package:stats':
#> 
#>     cov, smooth, var
#> Loading required package: PRROC
#> Loading required package: ModelMetrics
#> 
#> Attaching package: 'ModelMetrics'
#> The following object is masked from 'package:pROC':
#> 
#>     auc
#> The following objects are masked from 'package:caret':
#> 
#>     confusionMatrix, precision, recall, sensitivity, specificity
#> The following object is masked from 'package:base':
#> 
#>     kappa
#> Setting levels: control = N, case = Y
#> Setting direction: controls < cases

try ?mlho::mlearn to learn more about mlearn parameters.

you can select a variety of models. For binray classification tasks, chose from:ORFlog, RRF,gbm,bayesglm,regLogistic,xgbDART,regLogistic, glmnet, bayesglm, nb, svmRadialWeights,avNNet,and ordinalRF. see the list and description of all available models

mlearn outputs

outputs of mlearn include:

  • model performance metrics: model.test$ROC
    • in model.test AUROC is 0.9438009
  • final important features, which can be pulled and save from model.test$features
  • absolute errors for each patients using model.test$AE
  • missing important features model.test$missing.features

For instance, only the following 44 features were used in phenotyping the prediabetes:

model.test$features
#>         Overall  features classifier         aoi
#> 3     0.4753446 117015009        gbm prediabetes
#> 4     0.1307140 195662009        gbm prediabetes
#> 5     2.1585160 430193006        gbm prediabetes
#> 9     3.4754618  73761001        gbm prediabetes
#> 16    0.2179180    197604        gbm prediabetes
#> 17    1.6629729    314231        gbm prediabetes
#> 20    0.6289658  40055000        gbm prediabetes
#> 24  100.0000000 271737000        gbm prediabetes
#> 26    0.1185965    746030        gbm prediabetes
#> 27    1.6203613  44054006        gbm prediabetes
#> 28    0.1909684  68496003        gbm prediabetes
#> 31    0.7985321 237602007        gbm prediabetes
#> 43    0.1560429    309362        gbm prediabetes
#> 46    3.9806836    860975        gbm prediabetes
#> 47    0.1673445    896209        gbm prediabetes
#> 48    0.4946870   2001499        gbm prediabetes
#> 49    5.2727700  19169002        gbm prediabetes
#> 51    0.1538520  80394007        gbm prediabetes
#> 52    0.4099686 127013003        gbm prediabetes
#> 55    0.5944350 410429000        gbm prediabetes
#> 57    0.8275496 431855005        gbm prediabetes
#> 73    1.8496775   5880005        gbm prediabetes
#> 75    1.4887223  28163009        gbm prediabetes
#> 79    0.6398999  60027007        gbm prediabetes
#> 80    0.1326032  62106007        gbm prediabetes
#> 84    0.3620042 104091002        gbm prediabetes
#> 91    1.3197059 169230002        gbm prediabetes
#> 93    0.2964427 171207006        gbm prediabetes
#> 94    1.1574410 225158009        gbm prediabetes
#> 97    0.4409876 268556000        gbm prediabetes
#> 109   0.1463869 237001001        gbm prediabetes
#> 111   1.7055298  14768001        gbm prediabetes
#> 114   0.1177220    429503        gbm prediabetes
#> 116   0.5612521    849574        gbm prediabetes
#> 118   0.3267371 312681000        gbm prediabetes
#> 119   0.2292991    904419        gbm prediabetes
#> 120   0.2666824  64859006        gbm prediabetes
#> 126   0.4764296 230690007        gbm prediabetes
#> 127   0.3205344 239873007        gbm prediabetes
#> 128   0.1513470    807283        gbm prediabetes
#> 143   1.2814768    310325        gbm prediabetes
#> 153   0.8606760    895994        gbm prediabetes
#> 155   1.3815004    999967        gbm prediabetes
#> 204   0.9976881    sex_cd        gbm prediabetes

we recommend iterating the training and testing and storing the coefficients in a directory.

Visualizing the outputs

Here we create a plot of the feature importance scores for each of the top (here we have ) predictors identified by MLHO.

To do so, let’s map the concept codes to their “English” translation. That’s why we kept that 4th column called description in dbmart.

dbmart.concepts <- dbmart[!duplicated(paste0(dbmart$phenx)), c("phenx","DESCRIPTION")]
mlho.features <- data.frame(merge(model.test$features,dbmart.concepts,by.x="features",by.y = "phenx"))
datatable(dplyr::select(mlho.features,features,DESCRIPTION,`Feature importance`=Overall), options = list(pageLength = 5), filter = 'bottom')

now visualizing feature importance

(plot<- ggplot(mlho.features) +
    geom_segment(
      aes(y = 0,
          x = reorder(DESCRIPTION,Overall),
          yend = Overall,
          xend = DESCRIPTION),
      size=0.5,alpha=0.5) +
    geom_point(
      aes(x=reorder(DESCRIPTION,Overall),y=Overall),
      alpha=0.5,size=2,color="red") +
    theme_minimal()+
   coord_flip()+
    labs(y="Feature importance",x=""))

Notes for more complex implementation

As we recommended above, it’s best to at least iterate the training process 5 times, in which case you’ll have 5 files containing the important features from MLHO. We do this regularly. If you do so, you can compute the mean/median values for each feature.

Now if you want to get the regression coefficients and compute Odds Ratios (ORs), you can then select the top feature that were selected most frequently in MLHO iterations to run a regression model (pass glm to mlearn) and extract regression coefficients to then transform to ORs.