# data pulled from: https://www.advancedsportsanalytics.com/pga-overview
# look at var names and info
#names(pga)
#str(pga)
#head(pga)
# drop the empty variables
pga$`Unnamed: 2`<-NULL
pga$`Unnamed: 3`<-NULL
pga$`Unnamed: 4`<-NULL
# look for other NAs
pga[rowSums(is.na(pga))==0,]
## # A tibble: 17,008 × 34
## Player_initial_last `tournament id` `player id` hole_par strokes hole_DKP
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A. Ancer 401025267 9261 280 276 59.5
## 2 A. Cook 401025267 9517 280 270 76
## 3 A. Hadwin 401025267 5548 280 268 77
## 4 A. Landry 401025267 4682 280 272 66.5
## 5 A. Noren 401025267 3832 280 269 74.5
## 6 A. Putnam 401025267 5502 280 267 74.5
## 7 A. Scott 401025267 388 280 276 69
## 8 A. Wise 401025267 10577 280 267 83.5
## 9 B. An 401025267 5285 280 270 70
## 10 B. DeChambeau 401025267 10046 280 268 77
## # ℹ 16,998 more rows
## # ℹ 28 more variables: hole_FDP <dbl>, hole_SDP <dbl>, streak_DKP <dbl>,
## # streak_FDP <dbl>, streak_SDP <dbl>, n_rounds <dbl>, made_cut <dbl>,
## # pos <dbl>, finish_DKP <dbl>, finish_FDP <dbl>, finish_SDP <dbl>,
## # total_DKP <dbl>, total_FDP <dbl>, total_SDP <dbl>, player <chr>,
## # `tournament name` <chr>, course <chr>, date <chr>, purse <dbl>,
## # season <dbl>, no_cut <dbl>, Finish <chr>, sg_putt <dbl>, sg_arg <dbl>, …
# subset to numeric vars and describe
numVARS<-c("hole_par",
"strokes",
"hole_DKP",
"hole_FDP",
"hole_SDP",
"streak_DKP",
"streak_FDP",
"streak_SDP",
"pos",
"finish_DKP",
"finish_FDP",
"finish_SDP",
"total_DKP",
"total_FDP",
"total_SDP",
"sg_putt",
"sg_arg",
"sg_app",
"sg_ott",
"sg_t2g",
"sg_total"
)
numITEMS<-pga[numVARS]
numITEMS %>%
summarise(across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 × 21
## hole_par strokes hole_DKP hole_FDP hole_SDP streak_DKP streak_FDP streak_SDP
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0
## # ℹ 13 more variables: pos <int>, finish_DKP <int>, finish_FDP <int>,
## # finish_SDP <int>, total_DKP <int>, total_FDP <int>, total_SDP <int>,
## # sg_putt <int>, sg_arg <int>, sg_app <int>, sg_ott <int>, sg_t2g <int>,
## # sg_total <int>
pga<-pga %>%
filter_at(vars(sg_putt, sg_arg, sg_app, sg_ott, sg_t2g, sg_total), all_vars(!is.na(.)))
numITEMS<-numITEMS %>%
filter_at(vars(sg_putt, sg_arg, sg_app, sg_ott, sg_t2g, sg_total), all_vars(!is.na(.)))
kable(describe(numITEMS),
format='markdown',
digits=3)
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
hole_par | 1 | 30495 | 222.691 | 70.548 | 280.00 | 225.553 | 11.861 | 70.00 | 292.00 | 222.00 | -0.314 | -1.794 | 0.404 |
strokes | 2 | 30495 | 221.283 | 66.935 | 271.00 | 223.450 | 26.687 | 66.00 | 313.00 | 247.00 | -0.307 | -1.762 | 0.383 |
hole_DKP | 3 | 30495 | 49.507 | 24.115 | 52.00 | 48.764 | 32.617 | -2.50 | 174.00 | 176.50 | 0.255 | -0.647 | 0.138 |
hole_FDP | 4 | 30495 | 43.821 | 24.176 | 44.40 | 43.168 | 31.283 | -21.40 | 134.70 | 156.10 | 0.185 | -0.995 | 0.138 |
hole_SDP | 5 | 30495 | 48.724 | 22.281 | 54.00 | 48.801 | 31.135 | -11.00 | 107.00 | 118.00 | -0.066 | -1.412 | 0.128 |
streak_DKP | 6 | 30495 | 1.724 | 2.824 | 0.00 | 1.112 | 0.000 | 0.00 | 23.00 | 23.00 | 2.089 | 5.242 | 0.016 |
streak_FDP | 7 | 30495 | 7.550 | 7.214 | 6.40 | 6.612 | 8.303 | 0.00 | 43.60 | 43.60 | 0.937 | 0.276 | 0.041 |
streak_SDP | 8 | 30495 | 1.633 | 2.629 | 0.00 | 1.058 | 0.000 | 0.00 | 22.00 | 22.00 | 1.867 | 3.988 | 0.015 |
pos | 9 | 17008 | 34.141 | 21.398 | 32.00 | 33.428 | 26.687 | 1.00 | 91.00 | 90.00 | 0.225 | -1.080 | 0.164 |
finish_DKP | 10 | 30495 | 2.363 | 4.725 | 0.00 | 1.196 | 0.000 | 0.00 | 30.00 | 30.00 | 3.138 | 11.563 | 0.027 |
finish_FDP | 11 | 30495 | 2.022 | 4.664 | 0.00 | 0.810 | 0.000 | 0.00 | 30.00 | 30.00 | 3.430 | 13.267 | 0.027 |
finish_SDP | 12 | 30495 | 1.107 | 2.823 | 0.00 | 0.307 | 0.000 | 0.00 | 15.00 | 15.00 | 3.097 | 9.361 | 0.016 |
total_DKP | 13 | 30495 | 53.593 | 28.903 | 54.00 | 51.666 | 37.065 | -2.50 | 205.50 | 208.00 | 0.501 | -0.342 | 0.166 |
total_FDP | 14 | 30495 | 53.393 | 33.321 | 49.90 | 51.094 | 39.289 | -21.40 | 202.60 | 224.00 | 0.525 | -0.448 | 0.191 |
total_SDP | 15 | 30495 | 51.464 | 25.325 | 55.00 | 50.654 | 35.582 | -11.00 | 141.00 | 152.00 | 0.165 | -1.103 | 0.145 |
sg_putt | 16 | 30495 | -0.120 | 1.118 | -0.04 | -0.075 | 1.038 | -5.99 | 4.43 | 10.42 | -0.449 | 0.808 | 0.006 |
sg_arg | 17 | 30495 | -0.040 | 0.725 | 0.00 | -0.015 | 0.638 | -6.43 | 3.17 | 9.60 | -0.509 | 1.963 | 0.004 |
sg_app | 18 | 30495 | -0.100 | 1.116 | 0.00 | -0.045 | 1.023 | -9.25 | 4.67 | 13.92 | -0.608 | 1.418 | 0.006 |
sg_ott | 19 | 30495 | -0.045 | 0.809 | 0.05 | 0.015 | 0.697 | -7.74 | 2.77 | 10.51 | -1.149 | 3.739 | 0.005 |
sg_t2g | 20 | 30495 | -0.184 | 1.636 | -0.01 | -0.080 | 1.468 | -13.95 | 6.30 | 20.25 | -0.814 | 1.784 | 0.009 |
sg_total | 21 | 30495 | -0.301 | 1.961 | -0.16 | -0.187 | 1.794 | -13.67 | 8.52 | 22.19 | -0.714 | 1.432 | 0.011 |
courseVARS<-c(
"course",
"sg_putt",
"sg_arg",
"sg_app",
"sg_ott",
"sg_t2g",
"sg_total"
)
courseITEMS<-pga[courseVARS]
datwidenoyr <- courseITEMS %>% group_by(course) %>%
mutate(
avg_sg_putt = mean(sg_putt,na.rm=T),
avg_sg_arg = mean(sg_arg,na.rm=T),
avg_sg_app = mean(sg_app,na.rm=T),
avg_sg_ott = mean(sg_ott,na.rm=T),
avg_sg_t2g = mean(sg_t2g,na.rm=T),
avg_sg_total = mean(sg_total,na.rm=T))
aggVARS<-c(
"course",
"avg_sg_putt",
"avg_sg_arg",
"avg_sg_app",
"avg_sg_ott",
"avg_sg_t2g",
"avg_sg_total"
)
datwidenoyrclean<-datwidenoyr[aggVARS]
datwidenoyrclean<-distinct(datwidenoyrclean)
#### by year
pga$season<-as.factor(pga$season)
courseVARS<-c(
"course",
"season",
"sg_putt",
"sg_arg",
"sg_app",
"sg_ott",
"sg_t2g",
"sg_total"
)
courseITEMS<-pga[courseVARS]
datwideyr <- courseITEMS %>% group_by(course, season) %>%
mutate(
avg_sg_putt = mean(sg_putt,na.rm=T),
avg_sg_arg = mean(sg_arg,na.rm=T),
avg_sg_app = mean(sg_app,na.rm=T),
avg_sg_ott = mean(sg_ott,na.rm=T),
avg_sg_t2g = mean(sg_t2g,na.rm=T),
avg_sg_total = mean(sg_total,na.rm=T))
aggVARS<-c(
"course",
"season",
"avg_sg_putt",
"avg_sg_arg",
"avg_sg_app",
"avg_sg_ott",
"avg_sg_t2g",
"avg_sg_total"
)
datwideyrclean<-datwideyr[aggVARS]
datwideyrclean<-distinct(datwideyrclean)
The metric strokes gained was developed by Columbia business professor Mark Broadie and was a massive advancement in golf analytics: https://www8.gsb.columbia.edu/researcharchive/articles/4996. Strokes Gained measures the quality of each shot based on its starting and ending locations. So, calculations factor in the average number of shots for a golfer to get down from a given starting distance and starting lie condition and the average number of shots to get down from a given end distance and end lie condition. These average numbers are different for professional vs. amateur golfers. Negative numbers represent how many shots a golfer has lost compared to the benchmark, while positive numbers represent how many shots have been gained against said benchmark. Negative numbers highlight areas for improvement, while positive numbers indicate the golfer has performed well.
datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_putt),]
head(datwidenoyrclean$course)
## [1] "Torrey Pines North - La Jolla, CA"
## [2] "Sea Island Resort - Sea Island, GA"
## [3] "Country Club of Jackson - Jackson, MS"
## [4] "Glen Oaks Club - Old Westbury, NY"
## [5] "Eagle Point Golf Club - Wilmington, NC"
## [6] "Memorial Park GC - Houston, TX"
ggplot(datwidenoyrclean, aes(avg_sg_putt, reorder(course, avg_sg_putt))) +
geom_point() + xlab("Avg SG Putting") + ylab("Course") + theme_minimal()
datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_arg),]
head(datwidenoyrclean$course)
## [1] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [2] "Winged Foot GC - Mamaroneck, NY"
## [3] "Glen Oaks Club - Old Westbury, NY"
## [4] "Congaree Golf Club - Ridgeland, SC"
## [5] "Ridgewood Country Club - Paramus, NJ"
## [6] "Muirfield Village Golf Club - Dublin, OH"
ggplot(datwidenoyrclean, aes(avg_sg_arg, reorder(course, avg_sg_arg))) +
geom_point() + xlab("Avg SG ARG") + ylab("Course") + theme_minimal()
datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_app),]
head(datwidenoyrclean$course)
## [1] "Congaree Golf Club - Ridgeland, SC"
## [2] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [3] "Muirfield Village Golf Club - Dublin, OH"
## [4] "TPC Sawgrass - Ponte Vedra Beach, FL"
## [5] "PGA National - Palm Beach Gardens, FL"
## [6] "TPC Twin Cities - Blaine, MN"
ggplot(datwidenoyrclean, aes(avg_sg_app, reorder(course, avg_sg_app))) +
geom_point() + xlab("Avg SG App") + ylab("Course") + theme_minimal()
datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_ott),]
head(datwidenoyrclean$course)
## [1] "The Concession Golf Club - Bradenton, FL"
## [2] "Old White Course - White Sulphur Springs, WV"
## [3] "Golf Club of Houston - Houston, TX"
## [4] "Liberty National - Jersey City, NJ"
## [5] "TPC Sawgrass - Ponte Vedra Beach, FL"
## [6] "Congaree Golf Club - Ridgeland, SC"
ggplot(datwidenoyrclean, aes(avg_sg_ott, reorder(course, avg_sg_ott))) +
geom_point() + xlab("Avg SG OTT") + ylab("Course") + theme_minimal()
datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_total),]
head(datwidenoyrclean$course)
## [1] "Congaree Golf Club - Ridgeland, SC"
## [2] "TPC Sawgrass - Ponte Vedra Beach, FL"
## [3] "Torrey Pines North - La Jolla, CA"
## [4] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [5] "Muirfield Village Golf Club - Dublin, OH"
## [6] "Glen Oaks Club - Old Westbury, NY"
ggplot(datwidenoyrclean, aes(avg_sg_total, reorder(course, avg_sg_total))) +
geom_point() + xlab("Avg SG Total") + ylab("Course") + theme_minimal()
tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_putt, group = course)) +
geom_line() + guides(colour=FALSE) + xlab("Season") +
ylab("Avg SG Putt")
spag1 = tspag + aes(colour = factor(course)) + theme_bw()
tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_arg, group = course)) +
geom_line() + guides(colour=FALSE) + xlab("Season") +
ylab("Avg SG Around the Green")
spag2 = tspag + aes(colour = factor(course)) + theme_bw()
tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_app, group = course)) +
geom_line() + guides(colour=FALSE) + xlab("Season") +
ylab("Avg SG Approach")
spag3 = tspag + aes(colour = factor(course)) + theme_bw()
tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_ott, group = course)) +
geom_line() + guides(colour=FALSE) + xlab("Season") +
ylab("Avg SG Off the Tee")
spag4 = tspag + aes(colour = factor(course)) + theme_bw()
tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_t2g, group = course)) +
geom_line() + guides(colour=FALSE) + xlab("Season") +
ylab("Avg SG Tee to Green")
spag5 = tspag + aes(colour = factor(course)) + theme_bw()
tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_total, group = course)) +
geom_line() + guides(colour=FALSE) + xlab("Season") +
ylab("Avg SG Total")
spag6 = tspag + aes(colour = factor(course)) + theme_bw()
library(gridExtra)
grid.arrange(spag1, spag2, spag3, spag4, spag5, spag6, nrow = 2,
top = "Strokes Gained by Course over Time"
)
# count players' appearances
app<-as.data.frame(table(pga$Player_initial_last))
app<-app[ order(-app$Freq), ]
head(app)
## Var1 Freq
## 70 B. Stuard 182
## 432 T. Hoge 175
## 126 D. Lee 174
## 320 N. Taylor 173
## 51 B. Harman 171
## 440 T. Merritt 169
kable(describe(app$Freq), format='markdown',
digits=3)
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 482 | 63.268 | 50.865 | 56.5 | 59.759 | 65.976 | 1 | 182 | 181 | 0.384 | -1.114 | 2.317 |
ggplot(app, aes(x=Freq))+
geom_histogram(color="#FFFFFF", fill="#003C80")+
scale_x_continuous(breaks = seq(0, 200, by = 10))+
scale_y_continuous(breaks = seq(0, 90, len = 10))+
labs(title="Player Tournament Appearances (2015-2022)",x="Frequency", y = "Count")+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Brian Stuard appeared in the most PGA tournaments from 2015-2022. The average number of tournaments played in was 63. Many players performed in fewer than 20 tournaments.
winners <- pga[ which(pga$pos==1), ]
wincount<-winners %>% count(Player_initial_last)
wincount<-wincount[ order(-wincount$n), ]
ggplot(winners, aes(x = fct_infreq(Player_initial_last)))+
geom_bar(color="#FFFFFF", fill="#CCA600")+
labs(title="Player Tournament Wins (2015-2022)",x="Player", y = "Count")+
theme_minimal()+
theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))
Dustin Johnson and Justin Thomas had the most tournament wins in this time period.
library(gghighlight)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
winby<-as.data.frame(table(winners$Player_initial_last, winners$season))
winby <- winby %>%
rename("player" = "Var1",
"season" = "Var2",
"wins" = "Freq")
p2 <- winby %>%
ggplot(aes(x = season, y = wins, group = player)) +
labs(x= "Season", y= "Wins") +
ggtitle("Tournament Wins Over Time") +
geom_line() +
gghighlight(player == "R. McIlroy")+
theme_minimal()
## label_key: player
p2
streakVARS<-c(
"streak_DKP",
"streak_FDP",
"streak_SDP"
)
streakITEMS<-pga[streakVARS]
kable(describe(streakITEMS), format='markdown',
digits=3)
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
streak_DKP | 1 | 30495 | 1.724 | 2.824 | 0.0 | 1.112 | 0.000 | 0 | 23.0 | 23.0 | 2.089 | 5.242 | 0.016 |
streak_FDP | 2 | 30495 | 7.550 | 7.214 | 6.4 | 6.612 | 8.303 | 0 | 43.6 | 43.6 | 0.937 | 0.276 | 0.041 |
streak_SDP | 3 | 30495 | 1.633 | 2.629 | 0.0 | 1.058 | 0.000 | 0 | 22.0 | 22.0 | 1.867 | 3.988 | 0.015 |
## plot by cut
## scale strokes by round
pga$strokesperround<-(pga$strokes/pga$n_rounds)
pga$made_cut<-as.factor(pga$made_cut)
ggplot(pga, aes(x=made_cut, y=strokesperround, fill=made_cut)) +
geom_boxplot()+
theme_bw()+
labs(title="Stokes per Round by Cut", y = "Strokes per Round", x = "")+
scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
theme(legend.position="bottom")
## strokes and par
ggplot(pga, aes(x=hole_par, y=strokes)) +
geom_point()+
geom_smooth(method=lm)+
labs(y = "Total Strokes", x = "Total Hole Par")+
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## correlation matrix of numeric items
res <- cor(numITEMS, use="pairwise.complete.obs")
library(corrplot)
## corrplot 0.92 loaded
corrplot(res, type = "lower", order = "hclust",
tl.col = "black", tl.srt = 45)
# distributions of strokes gained
p1<-ggplot(pga, aes(sg_putt))+
geom_histogram(color="darkgreen", fill="lightgreen")+
labs(x="Putt")+
theme_bw()
p2<-ggplot(pga, aes(sg_arg))+
geom_histogram(color="darkgreen", fill="lightgreen")+
labs(x="ARG")+
theme_bw()
p3<-ggplot(pga, aes(sg_app))+
geom_histogram(color="darkgreen", fill="lightgreen")+
labs(x="App")+
theme_bw()
p4<-ggplot(pga, aes(sg_ott))+
geom_histogram(color="darkgreen", fill="lightgreen")+
labs(x="OTT")+
theme_bw()
p5<-ggplot(pga, aes(sg_t2g))+
geom_histogram(color="darkgreen", fill="lightgreen")+
labs(x="t2g")+
theme_bw()
p6<-ggplot(pga, aes(sg_total))+
geom_histogram(color="darkgreen", fill="lightgreen")+
labs(x="Total")+
theme_bw()
library(gridExtra)
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2,
top = "Strokes Gained"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
sgVARS<-c(
"strokes",
"pos",
"sg_putt",
"sg_arg",
"sg_app",
"sg_ott",
"sg_t2g",
"sg_total"
)
sgITEMS<-pga[sgVARS]
kable(describe(sgITEMS), format='markdown',
digits=3)
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
strokes | 1 | 30495 | 221.283 | 66.935 | 271.00 | 223.450 | 26.687 | 66.00 | 313.00 | 247.00 | -0.307 | -1.762 | 0.383 |
pos | 2 | 17008 | 34.141 | 21.398 | 32.00 | 33.428 | 26.687 | 1.00 | 91.00 | 90.00 | 0.225 | -1.080 | 0.164 |
sg_putt | 3 | 30495 | -0.120 | 1.118 | -0.04 | -0.075 | 1.038 | -5.99 | 4.43 | 10.42 | -0.449 | 0.808 | 0.006 |
sg_arg | 4 | 30495 | -0.040 | 0.725 | 0.00 | -0.015 | 0.638 | -6.43 | 3.17 | 9.60 | -0.509 | 1.963 | 0.004 |
sg_app | 5 | 30495 | -0.100 | 1.116 | 0.00 | -0.045 | 1.023 | -9.25 | 4.67 | 13.92 | -0.608 | 1.418 | 0.006 |
sg_ott | 6 | 30495 | -0.045 | 0.809 | 0.05 | 0.015 | 0.697 | -7.74 | 2.77 | 10.51 | -1.149 | 3.739 | 0.005 |
sg_t2g | 7 | 30495 | -0.184 | 1.636 | -0.01 | -0.080 | 1.468 | -13.95 | 6.30 | 20.25 | -0.814 | 1.784 | 0.009 |
sg_total | 8 | 30495 | -0.301 | 1.961 | -0.16 | -0.187 | 1.794 | -13.67 | 8.52 | 22.19 | -0.714 | 1.432 | 0.011 |
res2 <- cor(sgITEMS, use="pairwise.complete.obs")
round(res2, 2)
## strokes pos sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## strokes 1.00 0.09 0.35 0.23 0.35 0.24 0.46 0.59
## pos 0.09 1.00 -0.43 -0.29 -0.48 -0.37 -0.67 -0.86
## sg_putt 0.35 -0.43 1.00 0.03 -0.01 -0.03 -0.01 0.55
## sg_arg 0.23 -0.29 0.03 1.00 0.04 -0.01 0.46 0.40
## sg_app 0.35 -0.48 -0.01 0.04 1.00 0.11 0.75 0.61
## sg_ott 0.24 -0.37 -0.03 -0.01 0.11 1.00 0.57 0.45
## sg_t2g 0.46 -0.67 -0.01 0.46 0.75 0.57 1.00 0.82
## sg_total 0.59 -0.86 0.55 0.40 0.61 0.45 0.82 1.00
library(corrplot)
corrplot(res2, type = "lower", order = "hclust",
tl.col = "black", tl.srt = 45)
Strokes gained total and tee-to-green are sums of other strokes gained categories.
library(sjPlot)
pga$playerid<-as.factor(pga$`player id`)
pga$tourid<-as.factor(pga$`tournament id`)
m1<- lmer(pos ~ sg_ott+sg_app+sg_arg+sg_putt+(1|playerid), data=pga)
#summary(m1)
tab_model(m1)
pos | |||
---|---|---|---|
Predictors | Estimates | CI | p |
(Intercept) | 45.10 | 44.76 – 45.43 | <0.001 |
sg ott | -13.21 | -13.50 – -12.93 | <0.001 |
sg app | -13.09 | -13.28 – -12.89 | <0.001 |
sg arg | -13.41 | -13.71 – -13.12 | <0.001 |
sg putt | -13.59 | -13.78 – -13.39 | <0.001 |
Random Effects | |||
σ2 | 112.47 | ||
τ00 playerid | 6.75 | ||
ICC | 0.06 | ||
N playerid | 426 | ||
Observations | 17008 | ||
Marginal R2 / Conditional R2 | 0.731 / 0.746 |
#check_model(m1)
#model_performance(m1)
Unsurprisingly, as the total hole par increases, so do the number of strokes. Also unsurprisingly, players who made the cut had fewer stokes per round. Only players that made the cut received a final position ranking. For players that made the cut, each category of strokes gained was negatively related to final position indicating that higher strokes gained corresponded a lower position (better performance). Strokes gained putting and approaching the green were the most strongly related to worse performance.
Additionally, the strokes gained categories were uncorrelated with each other indicating that they are independent. For example, strokes gained putting was not related to strokes gained on the approach. So, if a golfer is struggling in one area, that may be independent from performance in other areas.
After accounting for clustering within player, all categories of of strokes gained statistically predicted final position in players that made the cut.
Strokes gained while putting is often what we hear about in golf coverage and what my high school golf coach always emphasized. However, among players that made tournament cuts, these results indicate that which category of strokes gained is the most impactful for overall performance may vary player to player.
Importantly, since only players that made the cut get a position, these relationships could change in the full sample. Let’s take a look at strokes gained predicting performance but with making the cut (yes/no) as the outcome.
table(pga$made_cut)
##
## 0 1
## 12726 17769
p1<-ggplot(pga, aes(x=made_cut, y=sg_ott, fill=made_cut)) +
geom_boxplot()+
theme_bw()+
labs(y = "Stokes Gained Off the Tee", x = "")+
scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
theme(legend.position = "none")
p2<-ggplot(pga, aes(x=made_cut, y=sg_app, fill=made_cut)) +
geom_boxplot()+
theme_bw()+
labs(y = "Stokes Gained Approach", x = "")+
scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
theme(legend.position = "none")
p3<-ggplot(pga, aes(x=made_cut, y=sg_arg, fill=made_cut)) +
geom_boxplot()+
theme_bw()+
labs(y = "Stokes Gained Around the Green", x = "")+
scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
theme(legend.position = "none")
p4<-ggplot(pga, aes(x=made_cut, y=sg_putt, fill=made_cut)) +
geom_boxplot()+
theme_bw()+
labs(y = "Stokes Gained Putting", x = "")+
scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
theme(legend.position = "none")
p5<-ggplot(pga, aes(x=made_cut, y=sg_t2g, fill=made_cut)) +
geom_boxplot()+
theme_bw()+
labs(y = "Stokes Gained Tee-to-Green", x = "")+
scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
theme(legend.position = "none")
p6<-ggplot(pga, aes(x=made_cut, y=sg_total, fill=made_cut)) +
geom_boxplot()+
theme_bw()+
labs(y = "Stokes Gained Total", x = "")+
scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
theme(legend.position = "none")
plegend<-ggplot(pga, aes(x=made_cut, y=sg_total, fill=made_cut)) +
geom_boxplot()+
theme_bw()+
labs(y = "Stokes Gained Total", x = "")+
scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
theme(legend.position = "bottom")
extract_legend <- function(my_ggp) {
step1 <- ggplot_gtable(ggplot_build(my_ggp))
step2 <- which(sapply(step1$grobs, function(x) x$name) == "guide-box")
step3 <- step1$grobs[[step2]]
return(step3)
}
shared_legend <- extract_legend(plegend)
library(gridExtra)
grid.arrange(arrangeGrob(p1, p2, p3, p4, p5, p6, ncol = 3),
top = "Strokes Gained by Cut",
shared_legend, nrow = 2, heights = c(10, 1))
quallogit <- glmer(made_cut ~ sg_ott+sg_app+sg_arg+sg_putt+(1|playerid), data = pga, family = "binomial", control = glmerControl(optimizer = "bobyqa"),
nAGQ = 10)
print(quallogit, corr=FALSE)
## Generalized linear mixed model fit by maximum likelihood (Adaptive
## Gauss-Hermite Quadrature, nAGQ = 10) [glmerMod]
## Family: binomial ( logit )
## Formula: made_cut ~ sg_ott + sg_app + sg_arg + sg_putt + (1 | playerid)
## Data: pga
## AIC BIC logLik deviance df.resid
## 26873.48 26923.43 -13430.74 26861.48 30489
## Random effects:
## Groups Name Std.Dev.
## playerid (Intercept) 0.2274
## Number of obs: 30495, groups: playerid, 484
## Fixed Effects:
## (Intercept) sg_ott sg_app sg_arg sg_putt
## 0.7656 1.0290 1.0975 1.0835 1.1755
se <- sqrt(diag(vcov(quallogit)))
(tab <- cbind(Est = fixef(quallogit), LL = fixef(quallogit) - 1.96 * se, UL = fixef(quallogit) + 1.96 *
se))
## Est LL UL
## (Intercept) 0.7655784 0.7248984 0.8062585
## sg_ott 1.0289975 0.9841851 1.0738100
## sg_app 1.0975417 1.0629155 1.1321679
## sg_arg 1.0835290 1.0366248 1.1304332
## sg_putt 1.1754735 1.1404946 1.2104523
tab_model(quallogit)
made_cut | |||
---|---|---|---|
Predictors | Odds Ratios | CI | p |
(Intercept) | 2.15 | 2.06 – 2.24 | <0.001 |
sg ott | 2.80 | 2.68 – 2.93 | <0.001 |
sg app | 3.00 | 2.89 – 3.10 | <0.001 |
sg arg | 2.96 | 2.82 – 3.10 | <0.001 |
sg putt | 3.24 | 3.13 – 3.35 | <0.001 |
Random Effects | |||
σ2 | 3.29 | ||
τ00 playerid | 0.05 | ||
ICC | 0.02 | ||
N playerid | 484 | ||
Observations | 30495 | ||
Marginal R2 / Conditional R2 | 0.589 / 0.595 |
Similar to the restricted sample, each category of strokes gained independently statistically predicted if player performance indexed by making the tournament cut. Higher strokes gained corresponded a better chance of making the tournament.
jpeg("spag.jpeg", units = "in", width = 12, res=200)
grid.arrange(spag1, spag2, spag3, spag4, spag5, spag6, nrow = 2, top = "Strokes Gained by Course over Time")
dev.off()
## quartz_off_screen
## 2