% Conference paper from the LREC-COLING 2024 proceedings (ACL Anthology file 2024.lrec-main.378).
% The source record had no citation key; `cimitan2024curation` follows the lastnameYYYYkeyword scheme.
% `institution` is not a standard field for this entry type; it is kept as affiliation metadata
% and is ignored by standard bibliography styles.
@inproceedings{cimitan2024curation,
  author      = {Cimitan, Ana and Alves Pinto, Ana and Geierhos, Michaela},
  title       = {Curation of Benchmark Templates for Measuring Gender Bias in Named Entity Recognition Models},
  editor      = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
  booktitle   = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  publisher   = {ELRA},
  year        = {2024},
  pages       = {4238--4246},
  url         = {https://aclanthology.org/2024.lrec-main.378.pdf},
  keywords    = {BERT, masked token prediction, gender gap},
  abstract    = {Named Entity Recognition (NER) constitutes a popular machine learning technique that empowers several natural language processing applications. As with other machine learning applications, NER models have been shown to be susceptible to gender bias. The latter is often assessed using benchmark datasets, which in turn are curated specifically for a given Natural Language Processing (NLP) task. In this work, we investigate the robustness of benchmark templates to detect gender bias and propose a novel method to improve the curation of such datasets. The method, based on masked token prediction, aims to filter out benchmark templates with a higher probability of detecting gender bias in NER models. We tested the method for English and German, using the corresponding fine-tuned BERT base model (cased) as the NER model. The gender gaps detected with templates classified as appropriate by the method were statistically larger than those detected with inappropriate templates. The results were similar for both languages and support the use of the proposed method in the curation of templates designed to detect gender bias.},
  institution = {Universität der Bundeswehr München, Fakultät für Informatik, INF 7 - Institut für Datensicherheit, Professur: Geierhos, Michaela},
}