Probabilistic Matching Workflow

2. Reduction of search space: Blocking
3. Comparison: String metrics, year comparisons, numeric comparisons
4. Classification: Fellegi-Sunter Model
5. Final prediction: cut off scores, validation
library(yaml)
library(haven)
library(rvest)
library(RecordLinkage)
## Datos de legisladores

## Flatten one legislator record into a data frame.
##
## x: a list with `name$official_full` and a `terms` list whose elements
##    each carry `state` and `party`.
## Returns a data frame with columns `official`, `state` and `party`, built
## from the official full name and the distinct states and parties found
## across the terms.
extraer_datos <- function(x) {
  term_states <- unique(sapply(x$terms, function(term) term$state))
  term_parties <- unique(sapply(x$terms, function(term) term$party))
  data.frame(
    "official" = x$name$official_full,
    "state" = term_states,
    "party" = term_parties
  )
}
## Apply extraer_datos to every element of `legs` and row-bind the results.
legs <- do.call("rbind", lapply(X = legs, FUN = extraer_datos))

## Extract data on contributions to legislators

## Collect the href of every anchor found inside the page's tables and drop
## the first four.
## NOTE(review): the head of this pipeline was missing, so the chain started
## from the string "table" and was never assigned, although `links` is used
## later in the script. Restored by analogy with the `data <- mapl %>% ...`
## chain -- TODO confirm against the original script.
links <- mapl %>%
  html_nodes("table") %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  .[-c(1:4)]

## Parse the first table node found in `mapl`.
data <- html_table(html_nodes(mapl, "table")[[1]])

## Now we can rebuild the donors table
donations <- cbind(data, links)
## Naive exact-match join on the name columns, for comparison
naive <- merge(
  x = legs,
  y = donations,
  by.x = "official",
  by.y = "Name",
  all.y = FALSE
)
nrow(naive) ## We are missing more than 100
## Prepare the data
## Make sure the blocking variables are the same in both tables
legs$state <- as.character(legs$state) ## What would happen if it were a factor
legs$party <- substr(legs$party, 1, 1)
## Replace values that start with two capitals and a hyphen by the two capitals
donations$State <- gsub("^([A-Z]{2})\\-.*", "\\1", donations$State)
## We do not need these columns for now
## (the two assignments were on one line with no separator, which does not parse)
donations$Chamber <- NULL
donations$links <- NULL
## Put the variables in the same order
legs <- legs[c("official", "party", "state")]
## Give `donations` the same column names as `legs` (assigned by position)
colnames(donations) <- colnames(legs)
## We need neither upper case nor punctuation
legs$official <- gsub("[[:punct:]]", "", tolower(legs$official))
donations$official <- gsub("[[:punct:]]", "", tolower(donations$official))
## Record linkage
## NOTE(review): the opening of this call was missing, leaving an orphaned
## argument list and `rpairs` (used below) undefined. Restored on the
## assumption that `legs` is the first data set and `donations` the second
## -- TODO confirm the order against the original script.
## Pairs are blocked on columns 2 and 3; string comparison uses the
## Levenshtein similarity.
rpairs <- compare.linkage(legs, donations,
                          blockfld = c(2, 3),
                          strcmp = TRUE,
                          strcmpfun = levenshteinSim)

Calculate M and U weights using the EM algorithm

## Compute the weights using a cut-off
rdist <- emWeights(rpairs, cutoff = 0.6)

A cut-off for the string comparators converts their similarity scores into binary 0/1 agreement values; the `emWeights` call above sets it to 0.6.

Initial estimates of M and U are set by the algorithm using frequencies of specific values for each column.

## Review: show the last 24 rows of the pairs with weights between 11 and 12
tail(getPairs(rdist, max.weight = 12, min.weight = 11), n = 24)
## Decision rule: upper threshold 11, lower threshold 10
res <- emClassify(rdist, threshold.upper = 11, threshold.lower = 10)
## Rebuild the data set and check that everything is correct
## (the two calls were on one line with no separator, which does not parse)
head(res$data1)
head(res$data2)
## Pairs predicted "L", shown as the two `official` names side by side
linked <- cbind(
  res$pairs[res$prediction == "L", ],
  res$prediction[res$prediction == "L"]
)
cbind(
  res$data1[linked$id1, "official"],
  res$data2[linked$id2, "official"]
)
## Pairs predicted "P", shown the same way
check <- cbind(
  res$pairs[res$prediction == "P", ],
  res$prediction[res$prediction == "P"]
)
cbind(
  res$data1[check$id1, "official"],
  res$data2[check$id2, "official"]
)