Code
library(tidyr)
library(tidyverse)
library(gt)
library(rvest)
library(dplyr)
library(scales)
library(reactable)
library(janitor)
Note that his project is a work in progress. I will update as I have the time. I am creating an RShiny app which will allow users to view whichever of this data they want (for any league with expected goal data on FBRef for that season).
library(tidyr)
library(tidyverse)
library(gt)
library(rvest)
library(dplyr)
library(scales)
library(reactable)
library(janitor)
<- 10000 # number of times to simulate each match
n set.seed(2025) # for reproducibility
<- function(year){
get_data <- paste0("https://fbref.com/en/comps/22/", year, "/schedule/",
url "-Major-League-Soccer-Scores-and-Fixtures")
year, <- read_html(url)
html <- html |>
html_table html_element("table") |>
html_table()
head(html_table)
<- html_table |>
data ::clean_names() |>
janitorfilter(date != "Date" & score != "") |>
separate_wider_regex(score, patterns = c(home_score = "[0-9]+", "–",
away_score = "[0-9]+")) |>
select("date", "home", "home_xg" = "x_g", "home_score",
"away_score", "away_xg" = "x_g_2", "away") |>
mutate("home_pts" = ifelse(home_score > away_score, 3,
ifelse(home_score == away_score, 1, 0)),
"away_pts" = ifelse(away_score > home_score, 3,
ifelse(away_score == home_score, 1, 0))) |>
type_convert()
data
}
<- function(data){
xpts_table <- rep(NA, times = nrow(data))
home_xpts <- rep(NA, times = nrow(data))
away_xpts for(i in 1:nrow(data)){
<- rpois(n, data$home_xg[i])
home_sim <- rpois(n, data$away_xg[i])
away_sim <- (3 * sum(home_sim > away_sim) + sum(home_sim == away_sim))/n
home_xpts[i] <- (3 * sum(away_sim > home_sim) + sum(away_sim == home_sim))/n
away_xpts[i]
}
# combine table of results with simulated results
<- cbind(data, home_xpts, away_xpts)
data
<- data |>
xpts_table pivot_longer(cols = c("home", "away"), values_to = "team") |>
mutate("pts" = ifelse(name == "home", home_pts, away_pts),
"scored" = ifelse(name == "home", home_score, away_score),
"against" = ifelse(name == "home", away_score, home_score),
"xpts" = ifelse(name == "home", home_xpts, away_xpts),
"xg_for" = ifelse(name == "home", home_xg, away_xg),
"xg_ag" = ifelse(name == "home", away_xg, home_xg),
"pts_over_exp" = pts - xpts) |>
group_by(team) |>
summarize(gp = n(), across(pts:xg_ag, \(x) round(mean(x), 3)),
pts_over_exp = round(sum(pts_over_exp), 3)) |>
arrange(desc(xpts))
xpts_table }
# use reactable to allow for interactive sorting
<- get_data(2025)
mls_2025 <- xpts_table(mls_2025)
mls_xpts_2025 reactable(mls_xpts_2025, pagination = FALSE)
set.seed(2025)
<- 10000
n
<- function(team, data){
team_stats <- data |>
team_matches filter(home == team | away == team) |>
mutate(opponent = ifelse(home == team, str_c("vs ", away),
str_c("@ ", home)),
team_gf = ifelse(home == team, home_score, away_score),
team_ga = ifelse(home == team, away_score, home_score),
team_xgf = ifelse(home == team, home_xg, away_xg),
team_xga = ifelse(home == team, away_xg, home_xg),
res = ifelse(team_gf > team_ga, "W",
ifelse(team_gf == team_ga, "D", "L")),
result = str_c(team_gf, "-", team_ga, " ", res)) |>
select(date, opponent, result, team_xgf, team_xga)
<- rep(NA, times = nrow(team_matches))
team_pwin <- rep(NA, times = nrow(team_matches))
team_pdraw <- rep(NA, times = nrow(team_matches))
team_xpts
for(i in 1:nrow(team_matches)){
<- rpois(n, team_matches$team_xgf[i])
team_sim <- rpois(n, team_matches$team_xga[i])
opp_sim <- label_percent(accuracy = 0.01)(round(sum(team_sim > opp_sim) / n, 4))
team_pwin[i] <- label_percent(accuracy = 0.01)(round(sum(team_sim == opp_sim) / n, 4))
team_pdraw[i] <- round((3*sum(team_sim > opp_sim) +
team_xpts[i] sum(team_sim == opp_sim))/n, 3)
}<- cbind(team_matches, team_pwin, team_pdraw, team_xpts)
team_matches
|>
team_matches select("Date" = date, "Opponent" = opponent, "Result" = result,
"xGF" = team_xgf, "xGA" = team_xga, "xPts" = team_xpts,
"P Win" = team_pwin, "P Draw" = team_pdraw)
}
<- team_stats("Vancouver W'caps", mls_2025)
vwfc_stats gt(vwfc_stats)
Date | Opponent | Result | xGF | xGA | xPts | P Win | P Draw |
---|---|---|---|---|---|---|---|
2025-02-23 | @ Portland Timbers | 4-1 W | 3.0 | 0.7 | 2.590 | 82.42% | 11.70% |
2025-03-02 | vs LA Galaxy | 2-1 W | 1.9 | 0.6 | 2.260 | 68.66% | 19.98% |
2025-03-08 | vs CF Montréal | 2-0 W | 1.6 | 1.3 | 1.560 | 43.76% | 24.76% |
2025-03-15 | @ FC Dallas | 1-0 W | 0.8 | 0.2 | 1.865 | 47.89% | 42.85% |
2025-03-22 | vs Chicago Fire | 1-3 L | 1.2 | 2.2 | 0.813 | 20.28% | 20.42% |
2025-03-29 | @ Toronto FC | 0-0 D | 1.3 | 0.7 | 1.824 | 51.52% | 27.84% |
2025-04-05 | vs Colorado Rapids | 2-0 W | 2.0 | 0.3 | 2.532 | 78.99% | 16.22% |
2025-04-12 | vs Austin | 5-1 W | 4.5 | 0.7 | 2.861 | 94.07% | 3.92% |
2025-04-19 | @ St. Louis | 0-0 D | 1.1 | 1.0 | 1.424 | 37.32% | 30.40% |
2025-04-27 | @ Minnesota Utd | 3-1 W | 1.2 | 1.1 | 1.428 | 38.12% | 28.42% |
2025-05-03 | vs Real Salt Lake | 2-1 W | 2.9 | 1.6 | 2.093 | 64.28% | 16.47% |
2025-05-11 | vs LAFC | 2-2 D | 0.8 | 1.8 | 0.698 | 15.65% | 22.84% |
2025-05-17 | @ Austin | 0-0 D | 0.4 | 1.4 | 0.578 | 10.31% | 26.88% |
2025-05-24 | @ Real Salt Lake | 3-2 W | 3.0 | 1.0 | 2.465 | 77.76% | 13.25% |
2025-05-28 | vs Minnesota Utd | 0-0 D | 1.4 | 0.2 | 2.332 | 69.00% | 26.22% |
2025-06-08 | vs Seattle Sounders | 3-0 W | 1.9 | 0.6 | 2.282 | 69.34% | 20.22% |
2025-06-14 | @ Columbus Crew | 1-2 L | 0.7 | 1.2 | 0.933 | 20.69% | 31.23% |
2025-06-25 | vs San Diego FC | 3-5 L | 1.1 | 1.5 | 1.098 | 28.03% | 25.70% |
2025-06-29 | @ LAFC | 1-0 W | 0.7 | 0.9 | 1.154 | 26.55% | 35.73% |
2025-07-04 | @ LA Galaxy | 0-3 L | 0.5 | 1.4 | 0.703 | 14.19% | 27.73% |
2025-07-12 | @ Colorado Rapids | 0-3 L | 1.2 | 2.5 | 0.666 | 16.34% | 17.62% |
2025-07-16 | @ Houston Dynamo | 3-0 W | 2.0 | 0.8 | 2.171 | 65.51% | 20.61% |
2025-07-19 | @ San Diego FC | 1-1 D | 2.0 | 1.1 | 1.990 | 59.07% | 21.74% |
2025-07-26 | vs Sporting KC | 3-0 W | 2.5 | 1.6 | 1.903 | 57.11% | 18.97% |
<- function(team_stats){
weekly_plot = 1:nrow(team_stats)
match <- cbind(team_stats, match)
team_plot
<- function(x, n){
substrRight substr(x, nchar(x)-n+1, nchar(x))
}
plot(x = match, y = vwfc_stats$xPts,
bg = ifelse(substrRight(vwfc_stats$Result, 1) == "W", "darkgreen",
ifelse(substrRight(vwfc_stats$Result, 1) == "D", "yellow",
"red")), pch = 21)
abline(h = mean(team_stats$xPts), col = "blue", lwd = 2)
abline(h = sum(mls_xpts_2025$xpts * mls_xpts_2025$gp)/
sum(mls_xpts_2025$gp), col = "red", lwd = 2)
}
weekly_plot(vwfc_stats)