API_KEY <-Sys.getenv("DEEPL_API")# educationLevelmp$educationLevel =replace(mp$educationLevel, mp$educationLevel=="wyższe","higher")mp$educationLevel =replace(mp$educationLevel, mp$educationLevel=="średnie ogólne","tertiary")mp$educationLevel =replace(mp$educationLevel, mp$educationLevel=="średnie zawodowe","vocational tertiary")mp$educationLevel =replace(mp$educationLevel, mp$educationLevel=="średnie policealne/pomaturalne","post-secondary")# professionmp$profession[is.na(mp$profession)] <-''mp$profession <-translate2(text = mp$profession, source_lang ="PL",target_lang ="EN",auth_key = API_KEY)mp$profession[mp$profession ==''] <-NA# inactiveCausemp$inactiveCause =replace(mp$inaciveCause, mp$inactiveCause=="Zrzeczenie","abdication")mp$inactiveCause =replace(mp$inactiveCause, mp$inactiveCause=="Zgon","death")# waiverDescmp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Zmarł","Died")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Objął mandat posła do Parlamentu Europejskiego","Became a member of the European Parliament")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Utrata prawa wybieralności","Loss of the right to be elected")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Wybrany na funkcję Prezydenta Miasta Krakowa","Elected as Mayor of the City of Krakow")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Wybrany na funkcję Prezydenta Miasta Leszna","Elected as Mayor of the City of Leszno")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Powołany na członka Zarządu Narodowego Banku Polskiego","Appointed as a member of the Management Board of the National Bank of Poland")
3 Background of the data
This data has been downloaded from API of Polish Parliament, to analyze the general statistics considering Polish MPs, and to give an insights on political processes of the Polish political body.
4 Description of the Data
4.1 Members of Parliament
Code
datatable(mp, filter ="top")
active: Determines if MP is an active MP
birthDate: Date of birth
birthLocation: Location of birth
club: Club membership
districtName: Name of the district of the MP
districtNum: Number of the district of the MP
educationLevel: Education level of the MP
email: E-mail to the MP
firstLastName: First and last name of the MP
firstName: First name of the MP
id: Id of the MP
lastFirstName: Last and first name of the MP
lastName: Last name of the MP
numberOfVotes: Number of votes for the MP
profession: Profession of the MP
secondName: Second name
voivodeship: Voivodeship of the MP
inactiveCause: The cause why MP is inactive (if inactive)
waiverDesc: Description of the cause why MP is inactive (if inactive)
The Polish Parliament API provides the data on all the voting, including the votes of a single MP. It provides a great opportunity to statistically analyze, and visualize the processes behind most important decisions in Poland.
15 Loading the Data
Code
# Two votings types: Yes/No; on list.# Ignoring on list, as not significant number of entries.votings_path <-"./Data/votings/"# Yes/No votingsy_n_votings <-read.csv(paste0(votings_path, "y_n_votings.csv"))
16 Cleaning the data
We are dropping yes column, as it contains number of yes votes, votes.firstName, votes.lastName, votes.secondName, votes.club, term, notParticipating, totalVoted, and no as those data is redundant or is not relevant. We would also drop kind as it has {R} length(unique(y_n_votings$kind)) unique value.
Code
# Yes/No votingsy_n_votings <- y_n_votings %>%select(!(c(yes, notParticipating, votes.firstName, votes.lastName, votes.secondName, votes.club, term, totalVoted, no, kind)))y_n_votings$date <-as.POSIXct(y_n_votings$date, format ="%Y-%m-%dT%H:%M:%S")
17 Description of the Data
The dataset after cleaning has the following columns:
abstain: 1 if MP abstained during voting, 0 otherwise
date: Date of the voting
sitting: Number of sitting
sittingDay: Day of the sitting
title: Title of the voting
votes.MP: ID of the MP
votes.vote: Vote of the MP
votingNumber: Number of voting
topic: Topic of the voting
description: Description of the voting
Sessions of Polish Parliament are divided into points, and votings. In this data, title column regards the point of the session, when topis, and description regard voting.
18 Number of votings per sitting, with days distinction
Code
votings_per_sitting <- y_n_votings %>%group_by(sitting, sittingDay) %>%summarise(number_of_votings =max(votingNumber))# Plot dataggplot(votings_per_sitting, aes(x =factor(sitting),y = number_of_votings,fill =factor(sittingDay))) +geom_bar(stat ="identity", position ="dodge") +labs(title ="Number of Votings per Sitting with Days Distinction",x ="Sitting Number",y ="Number of Votings",fill ="Sitting Day") +theme_minimal()
Let’s create a function to calculate the similarity of two MPs
Code
calculate_similarity <-function(mp1_votes, mp2_votes) { merged_votes <-merge(mp1_votes, mp2_votes, by =c("sitting", "votingNumber"),suffixes =c("_mp1", "_mp2"))# Calculate similarity score (percentage of matching votes) total_votes <-nrow(merged_votes)if (total_votes ==0) {return(0) # No common votes } matching_votes <-sum(merged_votes$votes.vote_mp1 == merged_votes$votes.vote_mp2, na.rm =TRUE) similarity_score <- matching_votes / total_votesreturn(similarity_score)}
We want to create the networks around leaders of the clubs
Code
leaders_ids <-c(148, 400, 241, 133, 174, 58)
24.1 Similarity network graphs
Code
build_similarity_network <-function(mp_data, voting_data, threshold =0.6) { mp_ids <-unique(mp_data$id) n_mps <-length(mp_ids) similarity_matrix <-matrix(0, nrow = n_mps, ncol = n_mps)rownames(similarity_matrix) <- mp_idscolnames(similarity_matrix) <- mp_idsfor (i in1:(n_mps -1)) {for (j in (i +1):n_mps) { mp1_data <-filter_mp_data(mp_ids[i], mp_data, voting_data) mp2_data <-filter_mp_data(mp_ids[j], mp_data, voting_data) similarity_score <-calculate_similarity(mp1_data$votes, mp2_data$votes) similarity_matrix[i, j] <- similarity_score similarity_matrix[j, i] <- similarity_score } } similarity_matrix <- (similarity_matrix +t(similarity_matrix)) /2 graph <-graph_from_adjacency_matrix(similarity_matrix, mode ="undirected", weighted =TRUE,diag =FALSE) graph <-delete_edges(graph, E(graph)[E(graph)$weight < threshold])# Print diagnostic informationprint(paste("Number of vertices in graph:", vcount(graph)))print(paste("Number of rows in mp_data:", nrow(mp_data)))print(paste("Number of unique IDs in mp_data:", length(unique(mp_data$id))))# Check if all graph vertex names are in mp_data$id missing_ids <-setdiff(V(graph)$name, mp_data$id)if (length(missing_ids) >0) {warning("Some vertex IDs are not present in mp_data$id")print(paste("Number of missing IDs:", length(missing_ids)))print("First few missing IDs:")print(head(missing_ids)) }# Create a data frame for vertex attributes, handling potential mismatches vertex_attr <-data.frame(name =V(graph)$name,stringsAsFactors =FALSE )# Match attributes, using NA for any unmatched IDs vertex_attr$club <- mp_data$club[match(vertex_attr$name, mp_data$id)] vertex_attr$firstLastName <- mp_data$firstLastName[match(vertex_attr$name, mp_data$id)]# Add attributes to the graph graph <-set_vertex_attr(graph, "club", value = vertex_attr$club) graph <-set_vertex_attr(graph, name ="firstLastName",value = vertex_attr$firstLastName)# Print diagnostic information about attributesprint(paste("Number of vertices with 'club' attribute:",sum(!is.na(vertex_attr$club))))print(paste("Number of vertices with 'firstLastName' attribute:",sum(!is.na(vertex_attr$firstLastName))))return(graph)}
Code
# Function to assign colors to clubsassign_club_colors <-function(graph) { clubs <-unique(vertex_attr(graph, "club")) clubs <- clubs[!is.na(clubs) & clubs !=""] n_clubs <-length(clubs)if (n_clubs >0) { color_palette <-brewer.pal(max(3, min(n_clubs, 9)), "Set1") club_colors <-setNames(color_palette[1:min(n_clubs, 9)], clubs)V(graph)$color <- club_colors[vertex_attr(graph, "club")] } else {V(graph)$color <-"lightblue" }V(graph)$color[is.na(V(graph)$color) |V(graph)$color ==""] <-"gray"return(list(graph = graph, club_colors = club_colors))}# Function to set leader labelsset_leader_labels <-function(graph, leaders_ids) {V(graph)$label <-rep(NA, vcount(graph)) leader_indices <-which(V(graph)$name %in%as.character(leaders_ids))if (length(leader_indices) >0&&"firstLastName"%in%vertex_attr_names(graph)) { leader_names <-vertex_attr(graph, "firstLastName")[leader_indices] valid_names <-!is.na(leader_names) & leader_names !=""V(graph)$label[leader_indices[valid_names]] <- leader_names[valid_names] }return(list(graph = graph, leader_indices = leader_indices))}# Function to plot the graphplot_network <-function(graph, layout, leader_indices) {par(mar =c(1, 1, 2, 1), bg ="gray95")plot(graph, vertex.label =NA,edge.width =E(graph)$weight *0.4,edge.color =adjustcolor("gray40", alpha.f =0.4),edge.curved =0.2,layout = layout,rescale =TRUE,asp =0,main ="Network Visualization")grid(nx =NULL, ny =NULL, col ="gray90", lty ="dotted")}# Function to add leader labelsadd_leader_labels <-function(graph, layout, leader_indices) {for (i inseq_along(leader_indices)) { idx <- leader_indices[i]if (!is.na(V(graph)$label[idx])) { angle <-2* pi * i /length(leader_indices) x_offset <-cos(angle) *0.15 y_offset <-sin(angle) *0.15 coords <- layout[idx,]arrows(coords[1], coords[2], coords[1] + x_offset, coords[2] + y_offset, length =0.1, col ="black", lwd =1.5)text(coords[1] + x_offset *1.2, coords[2] + y_offset *1.2, V(graph)$label[idx], cex =0.9, font =2) } }}# Function to add legendadd_legend <-function(club_colors) { legend_colors <-c(club_colors, "black") legend_names <-c(names(club_colors), "Leaders") legend_pch <-c(rep(19, length(club_colors)), 19) # Changed to 19 for leaderslegend("topright", legend = legend_names, col = legend_colors, pch = legend_pch, pt.cex =1, title ="Clubs and Leaders", bty ="n", cex =0.7,ncol =2, x.intersp =0.5, y.intersp =0.7)}# Main visualization functionvisualize_network <-function(graph, leaders_ids) {# Assign colors and set labels color_result <-assign_club_colors(graph) graph <- color_result$graph club_colors <- color_result$club_colors label_result <-set_leader_labels(graph, leaders_ids) graph <- label_result$graph leader_indices <- label_result$leader_indices# Adjust node sizesV(graph)$size <-3V(graph)$size[leader_indices] <-8# Calculate layout layout <-layout_with_kk(graph)# Plot the graphplot_network(graph, layout, leader_indices)# Add leader labelsadd_leader_labels(graph, layout, leader_indices)# Add legendadd_legend(club_colors)}
24.2 Visualization
Code
graph <-build_similarity_network(mp, y_n_votings)
[1] "Number of vertices in graph: 464"
[1] "Number of rows in mp_data: 464"
[1] "Number of unique IDs in mp_data: 464"
[1] "Number of vertices with 'club' attribute: 464"
[1] "Number of vertices with 'firstLastName' attribute: 464"
The above tables with network visualize the connections between the MPs regarding their voting decisions in the parliament. It is left for the reader to interpret those information.
25 Summary
This project aimed to empower voters by providing a deeper understanding of political bodies through a detailed analysis of Members of Parliament and their voting patterns. By offering insights into general statistics and performing network analysis on voting behaviors, the project equips individuals with the knowledge needed to make more informed and thoughtful political decisions. Through data-driven insights, it promotes transparency and fosters greater civic engagement in the political process.
---title: "Analysis of Members of Polish Parliament"subtitle: "Based on data from api.sejm.gov.pl"author: "Borys Łangowicz (neloduka_sobe)"format: html: code-tools: true code-fold: truenumber-sections: trueeditor: visualinclude-after-body: "footer.html"editor_options: chunk_output_type: inline---# Loading the Data```{R, message=FALSE}# Librarieslibrary(knitr)library(dplyr, warn.conflicts =FALSE)options(dplyr.summarise.inform =FALSE)library(ggplot2)library(treemapify)library(DT)library(lubridate, warn.conflicts =FALSE)library(calendR)library(deeplr)library(igraph)library(RColorBrewer)# Datamp_path <-"./Data/mp.csv"clubs_path <-"./Data/clubs.csv"mp <-read.csv(mp_path)clubs <-read.csv(clubs_path)```# Translating the data```{R}API_KEY <-Sys.getenv("DEEPL_API")# educationLevelmp$educationLevel =replace(mp$educationLevel, mp$educationLevel=="wyższe","higher")mp$educationLevel =replace(mp$educationLevel, mp$educationLevel=="średnie ogólne","tertiary")mp$educationLevel =replace(mp$educationLevel, mp$educationLevel=="średnie zawodowe","vocational tertiary")mp$educationLevel =replace(mp$educationLevel, mp$educationLevel=="średnie policealne/pomaturalne","post-secondary")# professionmp$profession[is.na(mp$profession)] <-''mp$profession <-translate2(text = mp$profession, source_lang ="PL",target_lang ="EN",auth_key = API_KEY)mp$profession[mp$profession ==''] <-NA# inactiveCausemp$inactiveCause =replace(mp$inaciveCause, mp$inactiveCause=="Zrzeczenie","abdication")mp$inactiveCause =replace(mp$inactiveCause, mp$inactiveCause=="Zgon","death")# waiverDescmp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Zmarł","Died")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Objął mandat posła do Parlamentu Europejskiego","Became a member of the European Parliament")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Utrata prawa wybieralności","Loss of the right to be elected")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Wybrany na funkcję Prezydenta Miasta Krakowa","Elected as Mayor of the City of Krakow")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Wybrany na funkcję Prezydenta Miasta Leszna","Elected as Mayor of the City of Leszno")mp$waiverDesc =replace(mp$waiverDesc, mp$waiverDesc=="Powołany na członka Zarządu Narodowego Banku Polskiego","Appointed as a member of the Management Board of the National Bank of Poland")```# Background of the dataThis data has been downloaded from [API of Polish Parliament](https://api.sejm.gov.pl/), to analyze the general statistics considering Polish MPs, and to give an insights on political processes of the Polish political body.# Description of the Data## Members of Parliament```{R}datatable(mp, filter ="top")```**`active`**: Determines if MP is an active MP**`birthDate`**: Date of birth**`birthLocation`**: Location of birth**`club`**: Club membership**`districtName`**: Name of the district of the MP**`districtNum`**: Number of the district of the MP**`educationLevel`**: Education level of the MP**`email`**: E-mail to the MP**`firstLastName`**: First and last name of the MP**`firstName`**: First name of the MP**`id`**: Id of the MP**`lastFirstName`**: Last and first name of the MP**`lastName`**: Last name of the MP**`numberOfVotes`**: Number of votes for the MP**`profession`**: Profession of the MP**`secondName`**: Second name**`voivodeship`**: Voivodeship of the MP**`inactiveCause`**: The cause why MP is inactive (if inactive)**`waiverDesc`**: Description of the cause why MP is inactive (if inactive)## Clubs```{R}kable(clubs)```**`email`**: E-mail of the club**`fax`**: Fax of the club**`id`**: Id of the club**`membersCount`**: Number of members**`name`**: Name of the club**`phone`**: Phone number of the club# Cleaning the DataWe would drop phone, email, and fax of the club, as it is data, we don't need for the statistical analysis```{R}clubs <- clubs %>%select(id, membersCount, name)```We would also drop `firstLastName`, and `lastFirstName`, as those are redundant in respect to `firstName`, `secondName`, and `lastName````{R}mp <- mp %>%select(!c(lastFirstName))```We also change `birthDate` to the date type```{R}mp <- mp %>%mutate(birthDate =as_date(birthDate))```# What is the club distribution in the parliament?```{R}club_distribution <- mp %>%group_by(club) %>%summarise(number_of_members =n()) %>%arrange(desc(number_of_members))kable(club_distribution)```As we can see from the table the biggest club in the parliament right now is PiS with 192 MPs.# What is the distribution of the MPs per voivodeships?```{R}voivodeship_distribution <- mp %>%group_by(voivodeship, club) %>%summarise(number_of_members =n()) %>%arrange(voivodeship, desc(number_of_members))per_voivodeship <- voivodeship_distribution %>%group_by(voivodeship) %>%summarise(number_of_members =sum(number_of_members))ggplot(per_voivodeship, aes(x="", y=number_of_members, fill=voivodeship)) +geom_bar(stat="identity") +coord_polar("y")```# What is the distribution of the MPs per voivodeship with regard to the club?```{R}ggplot(voivodeship_distribution, aes(fill=club, y=number_of_members, x=club)) +geom_bar(position="dodge", stat="identity") +scale_x_discrete(labels =NULL, breaks =NULL) +labs(x =NULL) +facet_wrap(~voivodeship)```# How does the distribution of education level of the Polish MPs look like?```{R}ggplot(mp, aes(x=educationLevel)) +geom_bar()```# How does the distribution of professions of the Polish MPs look like?```{R}professions <- mp %>%group_by(profession) %>%summarize(number =n()) %>%filter(!is.na(profession)) %>%arrange(desc(number))datatable(professions, filter ="top")p <- professions %>%filter(number >1)ggplot(p, aes(x=profession, y=number)) +geom_segment(aes(x=profession, xend=profession, y=0, yend=number)) +theme(axis.text.x =element_text(size =8, angle =45, hjust =1))```# Are there any inactive MP? What are the causes of it?```{R}not_active <- mp %>%filter(!active) %>%select(firstName, secondName, lastName, club, inactiveCause, waiverDesc)kable(not_active)```# Visualization of number of votes per MP using treemap```{R, out.width="100%"}treemap <- mp %>%mutate(secondName =ifelse(is.na(secondName), "", secondName))ggplot(treemap, aes(area=numberOfVotes, fill=club, label=paste(firstName, secondName, lastName), subgroup=club)) +geom_treemap() +geom_treemap_subgroup_border() +geom_treemap_text(min.size=1) +theme_void()```## What is the distribution of number of votes per club?```{R}ggplot(mp, aes(x=club, y=numberOfVotes, fill=club)) +geom_boxplot() +theme_classic() +theme(legend.position="none")```# Visualisation using calendar for birthdays```{R, out.width="100%"}# https://r-charts.com/evolution/calendar-heatmap/birthdays_per_day <- mp %>%group_by(month =month(birthDate), day =day(birthDate)) %>%summarise(number=n())birthdays_per_day <- birthdays_per_day %>%mutate(date =as.Date(paste("2024", month, day, sep="-"))) %>%mutate(yday =yday(date))all_days_df <-data.frame(yday =1:366)merged_df <-merge(all_days_df, birthdays_per_day, by ="yday", all.x =TRUE)merged_df$number[is.na(merged_df$number)] <-0calendR(year =2024,special.days = merged_df$number,gradient =TRUE,low.col ="#FCFFDD",special.col ="#00AAAE",legend.pos ="bottom",day.size=3.5,title.size =0,weeknames.size=2)```------------------------------------------------------------------------# Introducing Voting DataThe Polish Parliament API provides the data on all the voting, including the votes of a single MP. It provides a great opportunity to statistically analyze, and visualize the processes behind most important decisions in Poland.# Loading the Data```{R}# Two votings types: Yes/No; on list.# Ignoring on list, as not significant number of entries.votings_path <-"./Data/votings/"# Yes/No votingsy_n_votings <-read.csv(paste0(votings_path, "y_n_votings.csv"))```# Cleaning the dataWe are dropping yes column, as it contains number of yes `votes`, `votes.firstName`, `votes.lastName`, `votes.secondName`, `votes.club`, `term`, `notParticipating`, `totalVoted`, and `no` as those data is redundant or is not relevant. We would also drop `kind` as it has `{R} length(unique(y_n_votings$kind))` unique value.```{R}# Yes/No votingsy_n_votings <- y_n_votings %>%select(!(c(yes, notParticipating, votes.firstName, votes.lastName, votes.secondName, votes.club, term, totalVoted, no, kind)))y_n_votings$date <-as.POSIXct(y_n_votings$date, format ="%Y-%m-%dT%H:%M:%S")```# Description of the DataThe dataset after cleaning has the following columns:```{R}names(y_n_votings)````abstain`: 1 if MP abstained during voting, 0 otherwise`date`: Date of the voting`sitting`: Number of sitting`sittingDay`: Day of the sitting`title`: Title of the voting`votes.MP`: ID of the MP`votes.vote`: Vote of the MP`votingNumber`: Number of voting`topic`: Topic of the voting`description`: Description of the votingSessions of Polish Parliament are divided into points, and votings. In this data, `title` column regards the point of the session, when `topis`, and `description` regard voting.# Number of votings per sitting, with days distinction```{R}votings_per_sitting <- y_n_votings %>%group_by(sitting, sittingDay) %>%summarise(number_of_votings =max(votingNumber))# Plot dataggplot(votings_per_sitting, aes(x =factor(sitting),y = number_of_votings,fill =factor(sittingDay))) +geom_bar(stat ="identity", position ="dodge") +labs(title ="Number of Votings per Sitting with Days Distinction",x ="Sitting Number",y ="Number of Votings",fill ="Sitting Day") +theme_minimal()```# Votings heatmap on calendar```{R}# https://r-charts.com/evolution/calendar-heatmap/votings_per_day <- y_n_votings %>%group_by(date) %>%summarize(number =max(votingNumber))votings_per_day <- votings_per_day %>%mutate(yday =yday(date)) %>%group_by(yday) %>%summarize(number =max(number))all_days_df <-data.frame(yday =1:367)merged_df <-merge(all_days_df, votings_per_day, by ="yday", all.x =TRUE)merged_df <- merged_df %>%mutate(yday =ifelse(yday <=317, yday, yday-317))merged_df$number[is.na(merged_df$number)] <-0calendR(year =2024,special.days = merged_df$number,from=as.Date("2023-11-13"),to=as.Date("2024-11-13"),gradient =TRUE,low.col ="#FCFFDD",special.col ="#00AAAE",legend.pos ="bottom",day.size=3.5,title.size =0,weeknames.size=2)```# Voting heatmap days and time```{R}y_n_votings <- y_n_votings %>%mutate(day_of_week =wday(date, label =TRUE, week_start =1),hour =hour(date))votings_per_day_time <- y_n_votings %>%group_by(day_of_week, hour) %>%summarise(number_of_votings =max(votingNumber)) %>%ungroup()ggplot(votings_per_day_time, aes(x = hour, y = day_of_week,fill = number_of_votings)) +geom_tile(color ="white") +scale_fill_gradient(low ="yellow", high ="red") +labs(title ="Heatmap of Votings by Day and Time",x ="Hour of Day",y ="Day of Week",fill ="Number of Votings") +theme_minimal()```# Percentage of abstained votes heatmap per club```{R}merged_data <- y_n_votings %>%left_join(mp, by =c("votes.MP"="id")) %>%left_join(clubs, by =c("club"="id"))abstained_votes_percentage <- merged_data %>%group_by(club) %>%summarise(total_votes =n(),abstained_votes =sum(abstain==1)) %>%mutate(percentage_abstained = (abstained_votes / total_votes) *100) %>%ungroup()ggplot(abstained_votes_percentage, aes(x = club, y = percentage_abstained)) +geom_bar(stat ="identity", fill ="skyblue", width =0.5) +labs(title ="Percentage of Abstained Votes by Club",x ="Club",y ="Percentage of Abstained Votes") +theme(axis.text.x =element_text(angle =90, vjust =0.5, hjust=1))```# Most abstaining MPs```{R}abstaining <- y_n_votings %>%inner_join(mp, by =c("votes.MP"="id")) %>%group_by(firstName, secondName, lastName, club) %>%summarise(Abstain_Count =sum(abstain==1),Total_Votes =n(),Abstain_Rate =round(sum(abstain==1) /n() *100, 2)) %>%arrange(desc(Abstain_Rate))datatable(abstaining, filter ="top")```# Least voting MPs```{R}least_voting <- y_n_votings %>%inner_join(mp, by =c("votes.MP"="id")) %>%group_by(firstName, secondName, lastName, club) %>%summarise(Vote_Count =n()) %>%arrange(Vote_Count)datatable(least_voting, filter ="top")```# Network Analysis of the DataLet's create a function to filter data for a specific MP```{R}filter_mp_data <-function(mp_id, mp_data, voting_data) { mp <- mp_data[mp_data$id == mp_id, ] votes <- voting_data[voting_data$`votes.MP`== mp_id, ]return(list(mp = mp, votes = votes))}```Let's create a function to calculate the similarity of two MPs```{R}calculate_similarity <-function(mp1_votes, mp2_votes) { merged_votes <-merge(mp1_votes, mp2_votes, by =c("sitting", "votingNumber"),suffixes =c("_mp1", "_mp2"))# Calculate similarity score (percentage of matching votes) total_votes <-nrow(merged_votes)if (total_votes ==0) {return(0) # No common votes } matching_votes <-sum(merged_votes$votes.vote_mp1 == merged_votes$votes.vote_mp2, na.rm =TRUE) similarity_score <- matching_votes / total_votesreturn(similarity_score)}```We want to create the networks around leaders of the clubs```{R}leaders_ids <-c(148, 400, 241, 133, 174, 58)```## Similarity network graphs```{R}build_similarity_network <-function(mp_data, voting_data, threshold =0.6) { mp_ids <-unique(mp_data$id) n_mps <-length(mp_ids) similarity_matrix <-matrix(0, nrow = n_mps, ncol = n_mps)rownames(similarity_matrix) <- mp_idscolnames(similarity_matrix) <- mp_idsfor (i in1:(n_mps -1)) {for (j in (i +1):n_mps) { mp1_data <-filter_mp_data(mp_ids[i], mp_data, voting_data) mp2_data <-filter_mp_data(mp_ids[j], mp_data, voting_data) similarity_score <-calculate_similarity(mp1_data$votes, mp2_data$votes) similarity_matrix[i, j] <- similarity_score similarity_matrix[j, i] <- similarity_score } } similarity_matrix <- (similarity_matrix +t(similarity_matrix)) /2 graph <-graph_from_adjacency_matrix(similarity_matrix, mode ="undirected", weighted =TRUE,diag =FALSE) graph <-delete_edges(graph, E(graph)[E(graph)$weight < threshold])# Print diagnostic informationprint(paste("Number of vertices in graph:", vcount(graph)))print(paste("Number of rows in mp_data:", nrow(mp_data)))print(paste("Number of unique IDs in mp_data:", length(unique(mp_data$id))))# Check if all graph vertex names are in mp_data$id missing_ids <-setdiff(V(graph)$name, mp_data$id)if (length(missing_ids) >0) {warning("Some vertex IDs are not present in mp_data$id")print(paste("Number of missing IDs:", length(missing_ids)))print("First few missing IDs:")print(head(missing_ids)) }# Create a data frame for vertex attributes, handling potential mismatches vertex_attr <-data.frame(name =V(graph)$name,stringsAsFactors =FALSE )# Match attributes, using NA for any unmatched IDs vertex_attr$club <- mp_data$club[match(vertex_attr$name, mp_data$id)] vertex_attr$firstLastName <- mp_data$firstLastName[match(vertex_attr$name, mp_data$id)]# Add attributes to the graph graph <-set_vertex_attr(graph, "club", value = vertex_attr$club) graph <-set_vertex_attr(graph, name ="firstLastName",value = vertex_attr$firstLastName)# Print diagnostic information about attributesprint(paste("Number of vertices with 'club' attribute:",sum(!is.na(vertex_attr$club))))print(paste("Number of vertices with 'firstLastName' attribute:",sum(!is.na(vertex_attr$firstLastName))))return(graph)}``````{R}# Function to assign colors to clubsassign_club_colors <-function(graph) { clubs <-unique(vertex_attr(graph, "club")) clubs <- clubs[!is.na(clubs) & clubs !=""] n_clubs <-length(clubs)if (n_clubs >0) { color_palette <-brewer.pal(max(3, min(n_clubs, 9)), "Set1") club_colors <-setNames(color_palette[1:min(n_clubs, 9)], clubs)V(graph)$color <- club_colors[vertex_attr(graph, "club")] } else {V(graph)$color <-"lightblue" }V(graph)$color[is.na(V(graph)$color) |V(graph)$color ==""] <-"gray"return(list(graph = graph, club_colors = club_colors))}# Function to set leader labelsset_leader_labels <-function(graph, leaders_ids) {V(graph)$label <-rep(NA, vcount(graph)) leader_indices <-which(V(graph)$name %in%as.character(leaders_ids))if (length(leader_indices) >0&&"firstLastName"%in%vertex_attr_names(graph)) { leader_names <-vertex_attr(graph, "firstLastName")[leader_indices] valid_names <-!is.na(leader_names) & leader_names !=""V(graph)$label[leader_indices[valid_names]] <- leader_names[valid_names] }return(list(graph = graph, leader_indices = leader_indices))}# Function to plot the graphplot_network <-function(graph, layout, leader_indices) {par(mar =c(1, 1, 2, 1), bg ="gray95")plot(graph, vertex.label =NA,edge.width =E(graph)$weight *0.4,edge.color =adjustcolor("gray40", alpha.f =0.4),edge.curved =0.2,layout = layout,rescale =TRUE,asp =0,main ="Network Visualization")grid(nx =NULL, ny =NULL, col ="gray90", lty ="dotted")}# Function to add leader labelsadd_leader_labels <-function(graph, layout, leader_indices) {for (i inseq_along(leader_indices)) { idx <- leader_indices[i]if (!is.na(V(graph)$label[idx])) { angle <-2* pi * i /length(leader_indices) x_offset <-cos(angle) *0.15 y_offset <-sin(angle) *0.15 coords <- layout[idx,]arrows(coords[1], coords[2], coords[1] + x_offset, coords[2] + y_offset, length =0.1, col ="black", lwd =1.5)text(coords[1] + x_offset *1.2, coords[2] + y_offset *1.2, V(graph)$label[idx], cex =0.9, font =2) } }}# Function to add legendadd_legend <-function(club_colors) { legend_colors <-c(club_colors, "black") legend_names <-c(names(club_colors), "Leaders") legend_pch <-c(rep(19, length(club_colors)), 19) # Changed to 19 for leaderslegend("topright", legend = legend_names, col = legend_colors, pch = legend_pch, pt.cex =1, title ="Clubs and Leaders", bty ="n", cex =0.7,ncol =2, x.intersp =0.5, y.intersp =0.7)}# Main visualization functionvisualize_network <-function(graph, leaders_ids) {# Assign colors and set labels color_result <-assign_club_colors(graph) graph <- color_result$graph club_colors <- color_result$club_colors label_result <-set_leader_labels(graph, leaders_ids) graph <- label_result$graph leader_indices <- label_result$leader_indices# Adjust node sizesV(graph)$size <-3V(graph)$size[leader_indices] <-8# Calculate layout layout <-layout_with_kk(graph)# Plot the graphplot_network(graph, layout, leader_indices)# Add leader labelsadd_leader_labels(graph, layout, leader_indices)# Add legendadd_legend(club_colors)}```## Visualization```{R}graph <-build_similarity_network(mp, y_n_votings)visualize_network(graph, leaders_ids)```## Most similar voters```{R}mp <- mp %>%mutate(id =as.character(id))most_similar <-as_data_frame(graph, what ="edges") %>%arrange(desc(weight)) %>%head(10)most_similar <- most_similar %>%left_join(mp %>%select(id, firstName, lastName, club),by =c("from"="id")) %>%left_join(mp %>%select(id, firstName, lastName, club),by =c("to"="id"),suffix =c("_from", "_to"))datatable(most_similar, filter ="top")```The above tables with network visualize the connections between the MPs regarding their voting decisions in the parliament. It is left for the reader to interpret those information.# SummaryThis project aimed to empower voters by providing a deeper understanding of political bodies through a detailed analysis of Members of Parliament and their voting patterns. By offering insights into general statistics and performing network analysis on voting behaviors, the project equips individuals with the knowledge needed to make more informed and thoughtful political decisions. Through data-driven insights, it promotes transparency and fosters greater civic engagement in the political process.