Supplemenetary materials for Quantifying and Reducing Speaker Heterogeneity within the Common Voice Corpus for Phonetic Analysis (Interspeech 2025)

Miao Zhang, Aref Farhadipour, Annie Baker, Jiachen Ma, Bogdan Pricop, Eleanor Chodroff

2025/05/30

Similarity scores

Download the similarity score file from VoxCommunis.

col_names <- c("test", "enroll", "score")
# Replace the file path with the one on your computer
scores <- read_delim("/Users/miaozhang/Documents/GitHub/CV_clientID_cleaning/similarity_scores.txt", col_names = col_names, show_col_types = FALSE)
threshold = 0.38288
scores <- scores |> mutate(lang_code = str_extract(enroll, "(?<=voice_)[^_]+"), # Get the languages
                           under_threshold = if_else(score < threshold, T, F))
glimpse(scores)
## Rows: 9,204,867
## Columns: 5
## $ test            <chr> "common_voice_ab_27923256.wav", "common_voice_ab_29441…
## $ enroll          <chr> "common_voice_ab_27923260.wav", "common_voice_ab_29441…
## $ score           <dbl> 0.4625, 0.5034, 0.6747, 0.2018, 0.0490, 0.6149, 0.0552…
## $ lang_code       <chr> "ab", "ab", "ab", "ab", "ab", "ab", "ab", "ab", "ab", …
## $ under_threshold <lgl> FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TR…

Plot the distribution of the similarity scores across all languages.

scores |> 
  ggplot(aes(score)) +
  geom_histogram(aes(fill = after_stat(density)), bins = 40,
                 show.legend = F) +
  #geom_vline(xintercept = 0.38, color = "red3", linewidth = 0.5) +
  scale_fill_viridis_c() +
  scale_y_continuous(expand = expansion(mult = c(0, .1)),
                     label = scales::label_number(scale = 1/100000)) +
  scale_x_continuous(breaks = c(0, 0.4, 0.8)) +
  labs(y = expression("Count (\u00D710,000)"), x = "Similarity score") +
  coord_cartesian(xlim = c(-0.1, 1.0)) +
  theme(panel.grid.major.x = element_line(linewidth = 0.4),
        panel.grid.minor.x = element_line(linewidth = 0.2),
        panel.grid.major.y = element_line(linewidth = 0.4),
        axis.text.x = element_text(size=14),
        axis.text.y = element_text(size=14),
        axis.title.x = element_text(size=16),
        axis.title.y = element_text(size=13)) 

The next step require downloading speaker files from VoxCommunis Corpus.

# Replace the path below with the one on your own computer
spkr_dir <- "/Users/miaozhang/Research/VoxCommunis/VoxCommunis_Huggingface/speaker_files"
spkr_files <- list.files(spkr_dir, pattern = "*.tsv", full.names = T)

spkr_files <- map_dfr(spkr_files, \(x) read_tsv(x, col_select = c("path", "speaker_id"), show_col_types = FALSE))
spkr_files <- mutate(spkr_files, speaker_id = paste(str_extract(path, "(?<=voice_)[^_]+"), speaker_id, sep = "_"))
spkr_files <- rename(spkr_files, test = path)

spkr_files <- mutate(spkr_files, test = str_replace(test, ".mp3", ""))
scores <- mutate(scores, test = str_replace(test, ".wav", ""))

scores <- inner_join(scores, spkr_files, by = "test")
glimpse(scores)
## Rows: 8,497,101
## Columns: 6
## $ test            <chr> "common_voice_ab_27923256", "common_voice_ab_29441754"…
## $ enroll          <chr> "common_voice_ab_27923260.wav", "common_voice_ab_29441…
## $ score           <dbl> 0.4625, 0.5034, 0.6747, 0.2018, 0.0490, 0.6149, 0.0552…
## $ lang_code       <chr> "ab", "ab", "ab", "ab", "ab", "ab", "ab", "ab", "ab", …
## $ under_threshold <lgl> FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TR…
## $ speaker_id      <chr> "ab_5", "ab_6", "ab_14", "ab_7", "ab_8", "ab_8", "ab_9…
prop_under_threshold_by_lang <- scores |> 
  summarize(prop_threshold = sum(under_threshold)/n(), .by = lang_code)

Evaluate how much the client IDs are affected by the heterogeneous speaker issue by looking into how many client IDs that contain recordings from multiple speakers there are.

speaker_id_impact <- scores |> 
  summarize(n = n(), prop_threshold = sum(under_threshold)/n(), .by = speaker_id) |> 
  mutate(affected = if_else(prop_threshold > 0, T, F),
         more_than_10 = if_else(prop_threshold > 0.10, T, F),
         lang = str_extract(speaker_id, "^[^_]+")) 
n_affected_more_than_10 <- speaker_id_impact |> 
  summarize(n_affected = sum(more_than_10),
            n_total = n(), 
            perc = n_affected/n_total,
            .by = lang) 

prop_more_than_100 <- scores |> summarize(n = n() + 1, .by = c(lang_code, speaker_id)) |> 
  mutate(more_than_100 = if_else(n >= 100, T, F)) |> 
  summarize(prop_more_than_100 = sum(more_than_100)/n(),
            n = n(),
            .by = lang_code)

Plot the the degree to which client IDs and languages are affected.

# The proportion of files in each language with a score under the threshold
prop_under_threshold_by_lang |> 
  ggplot(aes(reorder(lang_code, -prop_threshold), prop_threshold, fill = prop_threshold)) +
  geom_col() +
  scale_fill_viridis_c() +
  scale_y_continuous(labels = scales::percent_format(), lim = c(0,0.3), 
                     expand = expansion(mult = c(0, .01))) +
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.35, size = 11),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major.y = element_line(linetype = 3),
        legend.title = element_blank())
# The proportion of client IDs that have more than 10% of the associated recordings with a score lower than the threshold
ggplot(n_affected_more_than_10, aes(reorder(lang, desc(perc)), perc, fill = perc)) +
  geom_col() +
  scale_y_continuous(labels = scales::label_percent(), expand = expansion(mult = c(0, 0.1))) +
  scale_fill_viridis_c() +
  guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 11),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank(),
        panel.grid.major.y = element_line(linetype = 3))

Auditing result: round 1

Read in the auditing result from round 1.

audit_r1 <- read_csv("/Users/miaozhang/Documents/GitHub/CV_clientID_cleaning/audit_r1.csv") |> 
  create_scoreBin() |> 
  mutate(validation = factor(validation, 
                             levels = c("Different Speaker", "Audio Quality Issue", 
                                        "Missing Speech", "Not Sure", "Same Speaker")))
## Rows: 2048 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): test, enroll, lang, validation, score_bin
## dbl (4): score, up_votes, down_votes, speaker_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(audit_r1)
## Rows: 2,048
## Columns: 9
## $ test       <chr> "common_voice_ab_29496822.mp3", "common_voice_ab_29537063.m…
## $ enroll     <chr> "common_voice_ab_29550540.mp3", "common_voice_ab_29550540.m…
## $ score      <dbl> 0.1851, 0.2400, 0.2835, 0.4160, 0.4256, 0.1969, 0.0090, 0.1…
## $ lang       <chr> "ab", "ab", "ab", "ab", "ab", "ab", "ab", "ab", "ab", "ab",…
## $ up_votes   <dbl> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ down_votes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ speaker_id <dbl> 367, 367, 367, 324, 266, 367, 355, 247, 364, 367, 364, 353,…
## $ validation <fct> Different Speaker, Different Speaker, Different Speaker, Di…
## $ score_bin  <fct> < 0.2, < 0.3, < 0.3, < 0.5, < 0.5, < 0.2, < 0.1, < 0.2, < 0…

Plot the round 1 results.

ggplot(audit_r1, aes(x = score_bin, fill = validation)) + 
  stat_count(position = "dodge") +
  scale_fill_viridis_d(end = 0.98, begin = 0.05, alpha = 0.9) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  theme_ggdist(base_size = 6.5) +
  theme(axis.title.x = element_blank(),
        panel.grid.major.y = element_line(linetype = 3, linewidth = 0.8),
        axis.text = element_text(size = 16),
        axis.title.y = element_text(size = 20),
        legend.text = element_text(size = 13),
        legend.position = "bottom",
        legend.title = element_blank()) 

Auditing result: round 1

Get the round 2 data.

audit_r2 <- read_csv("/Users/miaozhang/Documents/GitHub/CV_clientID_cleaning/audit_r2.csv") |> create_scoreBin()
## Rows: 150 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): lang, test, enroll, person1, validation1, person2, validation2, pe...
## dbl  (4): score, up_votes, down_votes, speaker_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(audit_r2)
## Rows: 150
## Columns: 18
## $ lang        <chr> "ab", "am", "am", "ba", "ba", "bas", "bas", "bas", "bas", …
## $ test        <chr> "common_voice_ab_29535966.mp3", "common_voice_am_37890165.…
## $ enroll      <chr> "common_voice_ab_29550540.mp3", "common_voice_am_37952755.…
## $ score       <dbl> 0.2567, 0.3594, 0.4269, 0.2330, 0.4308, 0.1913, 0.4854, 0.…
## $ up_votes    <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2…
## $ down_votes  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ speaker_id  <dbl> 367, 19, 19, 807, 884, 47, 48, 48, 48, 8210, 6455, 7154, 3…
## $ person1     <chr> "Annotator1", "Annotator1", "Annotator1", "Annotator1", "A…
## $ validation1 <chr> "Different Speaker", "Same Speaker", "Same Speaker", "Same…
## $ person2     <chr> "Annotator2", "Annotator2", "Annotator2", "Annotator2", "A…
## $ validation2 <chr> "Different Speaker", "Same Speaker", "Same Speaker", "Same…
## $ person3     <chr> "Annotator3", "Annotator3", "Annotator3", "Annotator3", "A…
## $ validation3 <chr> "Different Speaker", "Same Speaker", "Same Speaker", "Same…
## $ person4     <chr> "Annotator4", "Annotator4", "Annotator4", "Annotator4", "A…
## $ validation4 <chr> "Different Speaker", "Same Speaker", "Same Speaker", "Same…
## $ person5     <chr> "Annotator5", "Annotator5", "Annotator5", "Annotator5", "A…
## $ validation5 <chr> "Different Speaker", "Different Speaker", "Same Speaker", …
## $ score_bin   <fct> < 0.3, < 0.4, < 0.5, < 0.3, < 0.5, < 0.2, < 0.5, < 0.5, < …

Calculate the Fleiss’ Kappa.

kappa_dat <- audit_r2[, colnames(audit_r2) %in% c("validation1", "validation2", "validation3", "validation4", "validation5")]
kappam.fleiss(kappa_dat)
##  Fleiss' Kappa for m Raters
## 
##  Subjects = 150 
##    Raters = 5 
##     Kappa = 0.446 
## 
##         z = 21.6 
##   p-value = 0

Fit a GLMER model to evaluate the threshold to reject same speaker hypothesis.

audit_r2_long <- audit_r2 %>% pivot_longer(cols = starts_with("person"), 
                             names_to = ("person"), 
                             values_to = "name") %>% 
  pivot_longer(cols = starts_with("validation"), 
               names_to = ("validation"), 
               values_to = "val")
audit_r2_long <- subset(audit_r2_long, name == "Annotator1" & validation == "validation1" |
                   name == "Annotator2" & validation == "validation2" |
                   name == "Annotator3" & validation == "validation3" |
                   name == "Annotator4" & validation == "validation4" |
                   name == "Annotator5" & validation == "validation5")
glimpse(audit_r2_long)
## Rows: 750
## Columns: 12
## $ lang       <chr> "ab", "ab", "ab", "ab", "ab", "am", "am", "am", "am", "am",…
## $ test       <chr> "common_voice_ab_29535966.mp3", "common_voice_ab_29535966.m…
## $ enroll     <chr> "common_voice_ab_29550540.mp3", "common_voice_ab_29550540.m…
## $ score      <dbl> 0.2567, 0.2567, 0.2567, 0.2567, 0.2567, 0.3594, 0.3594, 0.3…
## $ up_votes   <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ down_votes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ speaker_id <dbl> 367, 367, 367, 367, 367, 19, 19, 19, 19, 19, 19, 19, 19, 19…
## $ score_bin  <fct> < 0.3, < 0.3, < 0.3, < 0.3, < 0.3, < 0.4, < 0.4, < 0.4, < 0…
## $ person     <chr> "person1", "person2", "person3", "person4", "person5", "per…
## $ name       <chr> "Annotator1", "Annotator2", "Annotator3", "Annotator4", "An…
## $ validation <chr> "validation1", "validation2", "validation3", "validation4",…
## $ val        <chr> "Different Speaker", "Different Speaker", "Different Speake…
samediff_all <- subset(audit_r2_long, val %in% c("Same Speaker","Different Speaker"))
samediff_all$nVal <- ifelse(samediff_all$val == "Same Speaker", 1, 0)
mod <- glmer(nVal ~ score + (1 | person) + (0 + score | person), family = "binomial", data = samediff_all)

Get the threshold.

threshold <- -fixef(mod)[1]/fixef(mod)[2]
threshold
## (Intercept) 
##   0.3874009

Plot the model result.

samediff_all$predicted <- predict(mod, type = "response", re.form = NA)
samediff_all$predicted_person <- predict(mod, type = "response")
preds <- ggpredict(mod, terms = "score[all]", interval = "confidence")
## You are calculating adjusted predictions on the population-level (i.e.
##   `type = "fixed"`) for a *generalized* linear mixed model.
##   This may produce biased estimates due to Jensen's inequality. Consider
##   setting `bias_correction = TRUE` to correct for this bias.
##   See also the documentation of the `bias_correction` argument.
ggplot(samediff_all, aes(x = score, y = nVal)) +
  geom_jitter(alpha = 0.08, width = 0.05, height = 0.05) +
  # Confidence interval ribbon
  geom_ribbon(data = preds, aes(x = x, ymin = conf.low, ymax = conf.high), 
              fill = "deepskyblue3", alpha = 0.2, inherit.aes = FALSE) +
  # Model prediction line
  geom_line(data = preds, aes(x = x, y = predicted), 
            color = "deepskyblue3", size = 1.2, inherit.aes = FALSE) +
  scale_x_continuous(limits = c(-0.2, 1)) +
  labs(x = "Similarity Score", y = "Probability") +
  geom_vline(xintercept = threshold, linetype = "dashed", color = "red") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank()) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.