## Probabilistic Matching Workflow
library(yaml)
library(haven)
library(rvest)
library(RecordLinkage)
## Legislator data: read the YAML file of legislators into R
legs <- yaml.load_file(input = "./dta/legislators-current.yaml")
## Flatten one legislator record into a small data frame.
##
## Args:
##   x: a list holding `name$official_full` and a `terms` list whose
##      elements each carry a `state` and a `party` string.
##
## Returns:
##   A data frame with the columns `official`, `state` and `party`. It has
##   one row per distinct state / party value found across the terms; when
##   the two sets differ in length data.frame() recycles the shorter one
##   (and errors if the lengths are not compatible).
extraer_datos <- function(x) {
  ## A record without `official_full` yields NULL, which data.frame()
  ## silently drops as a column and which later breaks rbind() across
  ## records. Keep the column and mark the name as missing instead.
  official <- x$name$official_full
  if (is.null(official)) {
    official <- NA_character_
  }
  ## vapply() guarantees a character vector; sapply() may return a list.
  states <- unique(vapply(x$terms, function(i) i$state, character(1)))
  parties <- unique(vapply(x$terms, function(i) i$party, character(1)))
  data.frame(
    "official" = official,
    "state" = states,
    "party" = parties
  )
}
## Flatten every legislator record and stack the pieces into one data frame
legs <- do.call("rbind", lapply(X = legs, FUN = extraer_datos))
## Extract the data on contributions to legislators
mapl <- read_html("http://maplight.org/us-congress/legislator")
## href of every anchor found inside a table, minus the first four entries
links <- html_attr(
  html_nodes(html_nodes(mapl, "table"), "a"),
  "href"
)[-c(1:4)]
## Contents of the first table on the page
data <- html_table(html_nodes(mapl, "table")[[1]])
## Keep the links next to the table: the donor table could now be
## retrieved through them
donations <- cbind(data, links)
## Naive approach: exact match between `official` and `Name`
naive <- merge(
  x = legs, y = donations,
  by.x = "official", by.y = "Name",
  all.y = FALSE
)
head(naive)
nrow(naive) ## We are missing more than 100
## Prepare the data
## Make sure the blocking variables look the same in both tables
donations$State <- gsub("^([A-Z]{2})\\-.*", "\\1", donations$State)
legs$state <- as.character(legs$state) ## What would happen if it were a factor?
legs$party <- substring(legs$party, first = 1, last = 1)
## These columns are not needed for now
donations$Chamber <- NULL
donations$links <- NULL
## Put the variables in the same order and give both tables the same names
legs <- legs[, c("official", "party", "state")]
names(donations) <- names(legs)
## Neither upper case nor punctuation is needed: lower-case the names,
## then strip every punctuation character
donations$official <- gsub("[[:punct:]]", "", tolower(donations$official))
legs$official <- gsub("[[:punct:]]", "", tolower(legs$official))
## Record linkage: build the comparison pairs between both tables.
## Blocking uses columns 2 and 3 (party and state); the remaining fields
## are compared with the Levenshtein similarity.
rpairs <- compare.linkage(
  dataset1 = legs,
  dataset2 = donations,
  blockfld = c(2, 3),
  strcmp = TRUE,
  strcmpfun = levenshteinSim
)
## Calculate the M and U weights using the EM algorithm.
## Create the weights, using 0.6 as the cutoff
rdist <- emWeights(rpairs, cutoff = 0.6)
## The cutoff passed to emWeights() (0.6 in the call above) is applied to the
## string comparators, converting their similarity scores into binary 0/1
## agreement values.
## Initial estimates of M and U are set by the algorithm from the frequencies
## of specific values in each column.
## Review: last 24 pairs among those with a weight between 11 and 12
tail(getPairs(rdist, max.weight = 12, min.weight = 11), n = 24)
## Decision rule: upper threshold 11, lower threshold 10
res <- emClassify(rdist, threshold.upper = 11, threshold.lower = 10)
## Rebuild the data set and check that everything is correct
head(res$data1)
head(res$data2)
## Pairs predicted as "L", then the names they pair up in both tables
linked <- cbind(
  res$pairs[res$prediction == "L", ],
  res$prediction[res$prediction == "L"]
)
cbind(
  res$data1[linked$id1, "official"],
  res$data2[linked$id2, "official"]
)
## Same view for the pairs predicted as "P" (presumably the possible
## links that need a manual check -- confirm against RecordLinkage docs)
check <- cbind(
  res$pairs[res$prediction == "P", ],
  res$prediction[res$prediction == "P"]
)
cbind(
  res$data1[check$id1, "official"],
  res$data2[check$id2, "official"]
)