% Journal article record (Patterns, vol. 2, no. 6, 2021).
% Author names use the unambiguous "Last, First" form. The note field keeps the
% exported funding/contribution statement, with LaTeX special characters escaped
% and dashes written as TeX ligatures so the entry compiles under any style.
@article{c4c208e655ec496e8ac575a517127d6c,
  author    = {Murugadoss, Karthik and Rajasekharan, Ajit and Malin, Bradley and Agarwal, Vineet and Bade, Sairam and Anderson, Jeff R. and Ross, Jason L. and Faubion, William A. and Halamka, John D. and Soundararajan, Venky and Ardhanari, Sankar},
  title     = {Building a best-in-class automated de-identification tool for electronic health records through ensemble learning},
  journal   = {Patterns},
  year      = {2021},
  month     = jun,
  day       = {11},
  volume    = {2},
  number    = {6},
  publisher = {Cell Press},
  issn      = {2666-3899},
  doi       = {10.1016/j.patter.2021.100255},
  language  = {English (US)},
  keywords  = {DSML 4: Production: Data science output is validated, understood, and regularly used for multiple domains/platforms, anonymization, de-identification, ensemble, mayo, nference, obfuscation},
  abstract  = {The presence of personally identifiable information (PII) in natural language portions of electronic health records (EHRs) constrains their broad reuse. Despite continuous improvements in automated detection of PII, residual identifiers require manual validation and correction. Here, we describe an automated de-identification system that employs an ensemble architecture, incorporating attention-based deep-learning models and rule-based methods, supported by heuristics for detecting PII in EHR data. Detected identifiers are then transformed into plausible, though fictional, surrogates to further obfuscate any leaked identifier. Our approach outperforms existing tools, with a recall of 0.992 and precision of 0.979 on the i2b2 2014 dataset and a recall of 0.994 and precision of 0.967 on a dataset of 10,000 notes from the Mayo Clinic. The de-identification system presented here enables the generation of de-identified patient data at the scale required for modern machine-learning applications to help accelerate medical discoveries.},
  note      = {Funding Information: We would like to thank the Mayo Clinic and the Mayo Clinic IRB, under whose auspices the development of the de-identification methods and testing against real-world datasets were made possible. We thank the nurse abstractors---Wendy Gay, Kathy Richmond, Denise Herman, Sandra Severson, Dawn Pereda, and Jane Emerson---for annotating the ground truth for the 172,102 sentences in the Mayo dataset that was used for testing the performance of the system; the Mayo Data Team of Ahmed Hadad, Connie Nehls, and Salena Tong for preparing and helping us understand the Mayo EHR data; and Andy Danielsen for supporting the collaboration. Finally, we thank Murali Aravamudan, Rakesh Barve, and A.J. Venkatakrishnan for their thoughtful review and feedback on the manuscript. Conceptualization, K.M. A.R. and S.A.; methodology, K.M. A.R. V.A. and S.A.; validation, K.M. B.M. V.A. S.B. J.A. J.R. and S.A.; formal analysis, K.M. B.M. and S.A.; data curation, K.M. V.A. and S.B.; writing -- original draft, K.M. and A.R.; writing -- review \& editing, B.M. J.A. J.R. W.F. J.H. V.S. and S.A.; supervision, V.S. and S.A.; resources, V.S. and S.A. J.A. J.H. and W.F. do not have any competing interests in this project. B.M. is a contracted consultant of the Mayo Clinic. The authors on this article from nference have equity in nference and have a financial interest in nference. A patent application has been submitted by K.M. A.R. and S.A. Mayo Clinic and nference may stand to gain financially from the successful outcome of the research. Publisher Copyright: {\textcopyright} 2021 The Authors},
}