Probabilistic Matching Workflow

  1. Preprocessing: developing link keys, extracting information from link keys, normalization of link keys
  2. Reduction of search space: Blocking
  3. Comparison: String metrics, year comparisons, numeric comparisons
  4. Classification: Fellegi-Sunter Model
  5. Final prediction: cut-off scores, validation
library(yaml)
library(haven)
library(rvest)
library(RecordLinkage)
## Legislator data: parse the congress-legislators YAML dump into a list,
## one element per legislator.
legs <- yaml.load_file("./dta/legislators-current.yaml")

## Extract one row (official name, state, party) per legislator entry.
##
## @param x A single legislator record from the YAML file: a list with
##   `name$official_full` and a `terms` list of term records.
## @return A one-row data.frame with columns official, state, party.
##
## We use the most recent term rather than unique() over all terms:
## a legislator who changed state or party would otherwise yield
## multi-valued columns, making data.frame() silently recycle rows or
## error out when the unique-value counts differ across columns.
extraer_datos <- function(x) {
    last_term <- x$terms[[length(x$terms)]]
    data.frame("official" = x$name$official_full,
               "state" = last_term$state,
               "party" = last_term$party)
}
## Stack the per-legislator one-row data frames into a single table.
legs <- do.call(rbind, lapply(legs, extraer_datos))

## Scrape legislator contribution data from maplight.org.
mapl <- read_html("http://maplight.org/us-congress/legislator")

## Collect the href of every anchor inside the page's tables.
## NOTE(review): the first four hrefs are dropped — presumably
## header/navigation links rather than legislators; confirm against
## the live page layout.
links <- mapl %>%
    html_nodes("table") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    .[-c(1:4)]

## Parse the first table on the page into a data.frame.
data <- mapl %>%
    html_nodes("table") %>%
    .[[1]] %>%
    html_table()

## Now we can reconstruct the donor table: attach the scraped links.
donations <- cbind(data, links)
## Naive deterministic merge on the exact full name; the default
## all = FALSE keeps only names present in both tables.
naive <- merge(legs, donations,
               by.x="official",
               by.y="Name",
               all.y=FALSE)
head(naive)
nrow(naive) ## Exact matching still misses more than 100 legislators
## Prepare both tables for probabilistic matching: the blocking
## variables must be encoded identically on each side.
## Reduce the maplight State field ("XX-<district>") to the two-letter code.
donations$State <- gsub("^([A-Z]{2})\\-.*", "\\1", donations$State)
## Force character type — a factor here would break the comparison.
legs$state <- as.character(legs$state)
## Keep only the party's first letter so the encodings agree.
legs$party <- substring(legs$party, 1, 1)
## Columns we do not need for the linkage step.
donations$Chamber <- NULL
donations$links <- NULL
## Align column order, then reuse the same names on both tables.
legs <- legs[, c("official", "party", "state")]
names(donations) <- names(legs)
## Normalise the link key: lower-case and strip all punctuation.
clean_key <- function(v) gsub("[[:punct:]]", "", tolower(v))
donations$official <- clean_key(donations$official)
legs$official <- clean_key(legs$official)
## Record linkage: generate candidate pairs, blocking on columns 2 and 3
## (party, state) to shrink the search space, and comparing the remaining
## fields with the Levenshtein string-similarity metric.
rpairs <- compare.linkage(legs, donations,                          
                          blockfld=c(2, 3),
                          strcmp=TRUE,
                          strcmpfun=levenshteinSim)

Calculate M and U weights using the EM algorithm

## Estimate match (M) and non-match (U) weights via the EM algorithm;
## the second argument is the cut-off that binarises the string-similarity
## scores into 0/1 agreement indicators.
rdist <- emWeights(rpairs, .6)

Define 0.6 as the cut-off for the string comparators (the second argument to `emWeights`), converting similarity scores into 0/1 binary agreement indicators.

Initial estimates of M and U are set by the algorithm using frequencies of specific values for each column.

## Review: inspect candidate pairs with weights between 11 and 12 to
## help choose the classification thresholds.
tail(getPairs(rdist, 12, 11), 24)
## Decision rule — per RecordLinkage::emClassify: weight >= 11 -> link
## ("L"), between 10 and 11 -> possible link ("P"), below 10 -> non-link.
res <- emClassify(rdist, 11, 10)
## Rebuild the linked records and check that everything is correct.
head(res$data1)
head(res$data2)
## Pairs classified as links ("L"): show the matched name columns side by side.
linked <- cbind(res$pairs[res$prediction == "L", ], res$prediction[res$prediction == "L"])
cbind(res$data1[linked$id1, 'official'], res$data2[linked$id2, 'official'])
## Pairs classified as possible links ("P"): candidates for manual review.
check <- cbind(res$pairs[res$prediction == "P", ], res$prediction[res$prediction == "P"])
cbind(res$data1[check$id1, 'official'], res$data2[check$id2, 'official'])

Next step: segue into supervised learning.