Analyzing and visualizing PGA tour data from 2015-2022

Data Import

Basic data cleaning and looking at descriptives

# data pulled from: https://www.advancedsportsanalytics.com/pga-overview

# look at var names and info
#names(pga)
#str(pga)
#head(pga)

# drop the empty variables
pga$`Unnamed: 2`<-NULL
pga$`Unnamed: 3`<-NULL
pga$`Unnamed: 4`<-NULL

# look for other NAs
pga[rowSums(is.na(pga))==0,]
## # A tibble: 17,008 × 34
##    Player_initial_last `tournament id` `player id` hole_par strokes hole_DKP
##    <chr>                         <dbl>       <dbl>    <dbl>   <dbl>    <dbl>
##  1 A. Ancer                  401025267        9261      280     276     59.5
##  2 A. Cook                   401025267        9517      280     270     76  
##  3 A. Hadwin                 401025267        5548      280     268     77  
##  4 A. Landry                 401025267        4682      280     272     66.5
##  5 A. Noren                  401025267        3832      280     269     74.5
##  6 A. Putnam                 401025267        5502      280     267     74.5
##  7 A. Scott                  401025267         388      280     276     69  
##  8 A. Wise                   401025267       10577      280     267     83.5
##  9 B. An                     401025267        5285      280     270     70  
## 10 B. DeChambeau             401025267       10046      280     268     77  
## # ℹ 16,998 more rows
## # ℹ 28 more variables: hole_FDP <dbl>, hole_SDP <dbl>, streak_DKP <dbl>,
## #   streak_FDP <dbl>, streak_SDP <dbl>, n_rounds <dbl>, made_cut <dbl>,
## #   pos <dbl>, finish_DKP <dbl>, finish_FDP <dbl>, finish_SDP <dbl>,
## #   total_DKP <dbl>, total_FDP <dbl>, total_SDP <dbl>, player <chr>,
## #   `tournament name` <chr>, course <chr>, date <chr>, purse <dbl>,
## #   season <dbl>, no_cut <dbl>, Finish <chr>, sg_putt <dbl>, sg_arg <dbl>, …
# subset to numeric vars and describe
numVARS<-c("hole_par",
          "strokes",
          "hole_DKP",
          "hole_FDP",
          "hole_SDP",
          "streak_DKP",
          "streak_FDP",
          "streak_SDP",
          "pos",
          "finish_DKP",
          "finish_FDP",
          "finish_SDP",
          "total_DKP",
          "total_FDP",
          "total_SDP",
          "sg_putt",
          "sg_arg",
          "sg_app",
          "sg_ott",
          "sg_t2g",
          "sg_total"
)
numITEMS<-pga[numVARS]

numITEMS %>% 
  summarise(across(everything(), ~ sum(is.na(.x))))
## # A tibble: 1 × 21
##   hole_par strokes hole_DKP hole_FDP hole_SDP streak_DKP streak_FDP streak_SDP
##      <int>   <int>    <int>    <int>    <int>      <int>      <int>      <int>
## 1        0       0        0        0        0          0          0          0
## # ℹ 13 more variables: pos <int>, finish_DKP <int>, finish_FDP <int>,
## #   finish_SDP <int>, total_DKP <int>, total_FDP <int>, total_SDP <int>,
## #   sg_putt <int>, sg_arg <int>, sg_app <int>, sg_ott <int>, sg_t2g <int>,
## #   sg_total <int>
pga<-pga %>%
  filter_at(vars(sg_putt, sg_arg, sg_app, sg_ott, sg_t2g, sg_total), all_vars(!is.na(.)))

numITEMS<-numITEMS %>%
  filter_at(vars(sg_putt, sg_arg, sg_app, sg_ott, sg_t2g, sg_total), all_vars(!is.na(.)))

kable(describe(numITEMS), 
      format='markdown', 
      digits=3)
vars n mean sd median trimmed mad min max range skew kurtosis se
hole_par 1 30495 222.691 70.548 280.00 225.553 11.861 70.00 292.00 222.00 -0.314 -1.794 0.404
strokes 2 30495 221.283 66.935 271.00 223.450 26.687 66.00 313.00 247.00 -0.307 -1.762 0.383
hole_DKP 3 30495 49.507 24.115 52.00 48.764 32.617 -2.50 174.00 176.50 0.255 -0.647 0.138
hole_FDP 4 30495 43.821 24.176 44.40 43.168 31.283 -21.40 134.70 156.10 0.185 -0.995 0.138
hole_SDP 5 30495 48.724 22.281 54.00 48.801 31.135 -11.00 107.00 118.00 -0.066 -1.412 0.128
streak_DKP 6 30495 1.724 2.824 0.00 1.112 0.000 0.00 23.00 23.00 2.089 5.242 0.016
streak_FDP 7 30495 7.550 7.214 6.40 6.612 8.303 0.00 43.60 43.60 0.937 0.276 0.041
streak_SDP 8 30495 1.633 2.629 0.00 1.058 0.000 0.00 22.00 22.00 1.867 3.988 0.015
pos 9 17008 34.141 21.398 32.00 33.428 26.687 1.00 91.00 90.00 0.225 -1.080 0.164
finish_DKP 10 30495 2.363 4.725 0.00 1.196 0.000 0.00 30.00 30.00 3.138 11.563 0.027
finish_FDP 11 30495 2.022 4.664 0.00 0.810 0.000 0.00 30.00 30.00 3.430 13.267 0.027
finish_SDP 12 30495 1.107 2.823 0.00 0.307 0.000 0.00 15.00 15.00 3.097 9.361 0.016
total_DKP 13 30495 53.593 28.903 54.00 51.666 37.065 -2.50 205.50 208.00 0.501 -0.342 0.166
total_FDP 14 30495 53.393 33.321 49.90 51.094 39.289 -21.40 202.60 224.00 0.525 -0.448 0.191
total_SDP 15 30495 51.464 25.325 55.00 50.654 35.582 -11.00 141.00 152.00 0.165 -1.103 0.145
sg_putt 16 30495 -0.120 1.118 -0.04 -0.075 1.038 -5.99 4.43 10.42 -0.449 0.808 0.006
sg_arg 17 30495 -0.040 0.725 0.00 -0.015 0.638 -6.43 3.17 9.60 -0.509 1.963 0.004
sg_app 18 30495 -0.100 1.116 0.00 -0.045 1.023 -9.25 4.67 13.92 -0.608 1.418 0.006
sg_ott 19 30495 -0.045 0.809 0.05 0.015 0.697 -7.74 2.77 10.51 -1.149 3.739 0.005
sg_t2g 20 30495 -0.184 1.636 -0.01 -0.080 1.468 -13.95 6.30 20.25 -0.814 1.784 0.009
sg_total 21 30495 -0.301 1.961 -0.16 -0.187 1.794 -13.67 8.52 22.19 -0.714 1.432 0.011

Course-based Research Questions:

Create course aggregates overall and by season

courseVARS<-c(
  "course",
          "sg_putt",
          "sg_arg",
          "sg_app",
          "sg_ott",
          "sg_t2g",
          "sg_total"
)
courseITEMS<-pga[courseVARS]

datwidenoyr <- courseITEMS %>% group_by(course) %>%   
  mutate(
    avg_sg_putt = mean(sg_putt,na.rm=T),
    avg_sg_arg = mean(sg_arg,na.rm=T),
    avg_sg_app = mean(sg_app,na.rm=T),
    avg_sg_ott = mean(sg_ott,na.rm=T),
    avg_sg_t2g = mean(sg_t2g,na.rm=T),
    avg_sg_total = mean(sg_total,na.rm=T))

aggVARS<-c(
  "course",
          "avg_sg_putt",
          "avg_sg_arg",
          "avg_sg_app",
          "avg_sg_ott",
          "avg_sg_t2g",
          "avg_sg_total"
)
datwidenoyrclean<-datwidenoyr[aggVARS]

datwidenoyrclean<-distinct(datwidenoyrclean)

#### by year 
pga$season<-as.factor(pga$season)

courseVARS<-c(
  "course",
  "season",
          "sg_putt",
          "sg_arg",
          "sg_app",
          "sg_ott",
          "sg_t2g",
          "sg_total"
)
courseITEMS<-pga[courseVARS]

datwideyr <- courseITEMS %>% group_by(course, season) %>%   
  mutate(
    avg_sg_putt = mean(sg_putt,na.rm=T),
    avg_sg_arg = mean(sg_arg,na.rm=T),
    avg_sg_app = mean(sg_app,na.rm=T),
    avg_sg_ott = mean(sg_ott,na.rm=T),
    avg_sg_t2g = mean(sg_t2g,na.rm=T),
    avg_sg_total = mean(sg_total,na.rm=T))

aggVARS<-c(
  "course",
  "season",
          "avg_sg_putt",
          "avg_sg_arg",
          "avg_sg_app",
          "avg_sg_ott",
          "avg_sg_t2g",
          "avg_sg_total"
)
datwideyrclean<-datwideyr[aggVARS]

datwideyrclean<-distinct(datwideyrclean)

1. How does strokes gained vary by course

The metric strokes gained was developed by Columbia business professor Mark Broadie and was a massive advancement in golf analytics: https://www8.gsb.columbia.edu/researcharchive/articles/4996. Strokes Gained measures the quality of each shot based on its starting and ending locations. So, calculations factor in the average number of shots for a golfer to get down from a given starting distance and starting lie condition and the average number of shots to get down from a given end distance and end lie condition. These average numbers are different for professional vs. amateur golfers. Negative numbers represent how many shots a golfer has lost compared to the benchmark, while positive numbers represent how many shots have been gained against said benchmark. Negative numbers highlight areas for improvement, while positive numbers indicate the golfer has performed well.

1. How do different strokes gained metrics vary course to course from 2015-2022?

1a. Which courses had the toughest greens on average from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_putt),]
head(datwidenoyrclean$course)
## [1] "Torrey Pines North - La Jolla, CA"     
## [2] "Sea Island Resort - Sea Island, GA"    
## [3] "Country Club of Jackson - Jackson, MS" 
## [4] "Glen Oaks Club - Old Westbury, NY"     
## [5] "Eagle Point Golf Club - Wilmington, NC"
## [6] "Memorial Park GC - Houston, TX"
ggplot(datwidenoyrclean, aes(avg_sg_putt, reorder(course, avg_sg_putt))) +
        geom_point() + xlab("Avg SG Putting") + ylab("Course") + theme_minimal()

1b. Which courses were toughest around the greens on average from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_arg),]
head(datwidenoyrclean$course)
## [1] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [2] "Winged Foot GC - Mamaroneck, NY"              
## [3] "Glen Oaks Club - Old Westbury, NY"            
## [4] "Congaree Golf Club - Ridgeland, SC"           
## [5] "Ridgewood Country Club - Paramus, NJ"         
## [6] "Muirfield Village Golf Club - Dublin, OH"
ggplot(datwidenoyrclean, aes(avg_sg_arg, reorder(course, avg_sg_arg))) +
        geom_point() + xlab("Avg SG ARG") + ylab("Course") + theme_minimal()

1c. Which courses were toughest on the approach on average from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_app),]
head(datwidenoyrclean$course)
## [1] "Congaree Golf Club - Ridgeland, SC"           
## [2] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [3] "Muirfield Village Golf Club - Dublin, OH"     
## [4] "TPC Sawgrass - Ponte Vedra Beach, FL"         
## [5] "PGA National - Palm Beach Gardens, FL"        
## [6] "TPC Twin Cities - Blaine, MN"
ggplot(datwidenoyrclean, aes(avg_sg_app, reorder(course, avg_sg_app))) +
        geom_point() + xlab("Avg SG App") + ylab("Course") + theme_minimal()

1d. Which courses were toughest off the tee on average from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_ott),]
head(datwidenoyrclean$course)
## [1] "The Concession Golf Club - Bradenton, FL"    
## [2] "Old White Course - White Sulphur Springs, WV"
## [3] "Golf Club of Houston - Houston, TX"          
## [4] "Liberty National - Jersey City, NJ"          
## [5] "TPC Sawgrass - Ponte Vedra Beach, FL"        
## [6] "Congaree Golf Club - Ridgeland, SC"
ggplot(datwidenoyrclean, aes(avg_sg_ott, reorder(course, avg_sg_ott))) +
        geom_point() + xlab("Avg SG OTT") + ylab("Course") + theme_minimal()

1e. Which courses were toughest as measured by total strokes gained from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_total),]
head(datwidenoyrclean$course)
## [1] "Congaree Golf Club - Ridgeland, SC"           
## [2] "TPC Sawgrass - Ponte Vedra Beach, FL"         
## [3] "Torrey Pines North - La Jolla, CA"            
## [4] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [5] "Muirfield Village Golf Club - Dublin, OH"     
## [6] "Glen Oaks Club - Old Westbury, NY"
ggplot(datwidenoyrclean, aes(avg_sg_total, reorder(course, avg_sg_total))) +
        geom_point() + xlab("Avg SG Total") + ylab("Course") + theme_minimal()

2. How have strokes gained fluctuated by course over time?

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_putt, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Putt")
spag1 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_arg, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Around the Green")
spag2 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_app, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Approach")
spag3 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_ott, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Off the Tee")
spag4 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_t2g, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Tee to Green")
spag5 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_total, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Total")
spag6 = tspag + aes(colour = factor(course)) + theme_bw()


library(gridExtra)
grid.arrange(spag1, spag2, spag3, spag4, spag5, spag6, nrow = 2,
             top = "Strokes Gained by Course over Time"
)

Player-based Research Questions:

1. Who appeared in the most tournaments since 2015?

1a. What is the average number of tournament appearances?

# count players' appearances
app<-as.data.frame(table(pga$Player_initial_last)) 
app<-app[ order(-app$Freq), ]
head(app)
##           Var1 Freq
## 70   B. Stuard  182
## 432    T. Hoge  175
## 126     D. Lee  174
## 320  N. Taylor  173
## 51   B. Harman  171
## 440 T. Merritt  169
kable(describe(app$Freq), format='markdown', 
      digits=3) 
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 482 63.268 50.865 56.5 59.759 65.976 1 182 181 0.384 -1.114 2.317
ggplot(app, aes(x=Freq))+
  geom_histogram(color="#FFFFFF", fill="#003C80")+
  scale_x_continuous(breaks = seq(0, 200, by = 10))+
  scale_y_continuous(breaks = seq(0, 90, len = 10))+
  labs(title="Player Tournament Appearances (2015-2022)",x="Frequency", y = "Count")+
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Brian Stuard appeared in the most PGA tournaments from 2015-2022. The average number of tournaments played in was 63. Many players performed in fewer than 20 tournaments.

2. Who won the most tournaments since 2015?

winners <- pga[ which(pga$pos==1), ]

wincount<-winners %>% count(Player_initial_last)
wincount<-wincount[ order(-wincount$n), ]

ggplot(winners, aes(x = fct_infreq(Player_initial_last)))+
  geom_bar(color="#FFFFFF", fill="#CCA600")+
  labs(title="Player Tournament Wins (2015-2022)",x="Player", y = "Count")+
  theme_minimal()+
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

Dustin Johnson and Justin Thomas had the most tournament wins in this time period.

2a. How have wins changed over time?

library(gghighlight)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
winby<-as.data.frame(table(winners$Player_initial_last, winners$season))

winby <- winby %>% 
       rename("player" = "Var1",
              "season" = "Var2",
              "wins" = "Freq")

p2 <- winby %>% 
  ggplot(aes(x = season, y = wins, group = player)) + 
  labs(x= "Season", y= "Wins") + 
  ggtitle("Tournament Wins Over Time") +
  geom_line() + 
  gghighlight(player == "R. McIlroy")+
  theme_minimal()
## label_key: player
p2

3. Examining max streak DKP, FDP and SDP (fantasy golf / sports betting data)

streakVARS<-c(
           "streak_DKP",
           "streak_FDP",
           "streak_SDP"
)
streakITEMS<-pga[streakVARS]
kable(describe(streakITEMS), format='markdown', 
      digits=3) 
vars n mean sd median trimmed mad min max range skew kurtosis se
streak_DKP 1 30495 1.724 2.824 0.0 1.112 0.000 0 23.0 23.0 2.089 5.242 0.016
streak_FDP 2 30495 7.550 7.214 6.4 6.612 8.303 0 43.6 43.6 0.937 0.276 0.041
streak_SDP 3 30495 1.633 2.629 0.0 1.058 0.000 0 22.0 22.0 1.867 3.988 0.015

4. Information on strokes and strokes gained

## plot by cut
## scale strokes by round
pga$strokesperround<-(pga$strokes/pga$n_rounds)
pga$made_cut<-as.factor(pga$made_cut)

ggplot(pga, aes(x=made_cut, y=strokesperround, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(title="Stokes per Round by Cut", y = "Strokes per Round", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position="bottom")

## strokes and par 
ggplot(pga, aes(x=hole_par, y=strokes)) + 
  geom_point()+
  geom_smooth(method=lm)+
  labs(y = "Total Strokes", x = "Total Hole Par")+
  theme_bw()  
## `geom_smooth()` using formula = 'y ~ x'

## correlation matrix of numeric items
res <- cor(numITEMS, use="pairwise.complete.obs")
library(corrplot)
## corrplot 0.92 loaded
corrplot(res, type = "lower", order = "hclust", 
         tl.col = "black", tl.srt = 45)

# distributions of strokes gained
p1<-ggplot(pga, aes(sg_putt))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="Putt")+
  theme_bw()

p2<-ggplot(pga, aes(sg_arg))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="ARG")+
  theme_bw()

p3<-ggplot(pga, aes(sg_app))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="App")+
  theme_bw()

p4<-ggplot(pga, aes(sg_ott))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="OTT")+
  theme_bw()

p5<-ggplot(pga, aes(sg_t2g))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="t2g")+
  theme_bw()

p6<-ggplot(pga, aes(sg_total))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="Total")+
  theme_bw()

library(gridExtra)
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2,
             top = "Strokes Gained"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

sgVARS<-c(
           "strokes",
           "pos",
           "sg_putt",
           "sg_arg",
           "sg_app",
           "sg_ott",
           "sg_t2g",
           "sg_total"

)
sgITEMS<-pga[sgVARS]
kable(describe(sgITEMS), format='markdown', 
      digits=3) 
vars n mean sd median trimmed mad min max range skew kurtosis se
strokes 1 30495 221.283 66.935 271.00 223.450 26.687 66.00 313.00 247.00 -0.307 -1.762 0.383
pos 2 17008 34.141 21.398 32.00 33.428 26.687 1.00 91.00 90.00 0.225 -1.080 0.164
sg_putt 3 30495 -0.120 1.118 -0.04 -0.075 1.038 -5.99 4.43 10.42 -0.449 0.808 0.006
sg_arg 4 30495 -0.040 0.725 0.00 -0.015 0.638 -6.43 3.17 9.60 -0.509 1.963 0.004
sg_app 5 30495 -0.100 1.116 0.00 -0.045 1.023 -9.25 4.67 13.92 -0.608 1.418 0.006
sg_ott 6 30495 -0.045 0.809 0.05 0.015 0.697 -7.74 2.77 10.51 -1.149 3.739 0.005
sg_t2g 7 30495 -0.184 1.636 -0.01 -0.080 1.468 -13.95 6.30 20.25 -0.814 1.784 0.009
sg_total 8 30495 -0.301 1.961 -0.16 -0.187 1.794 -13.67 8.52 22.19 -0.714 1.432 0.011
res2 <- cor(sgITEMS, use="pairwise.complete.obs")
round(res2, 2)
##          strokes   pos sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## strokes     1.00  0.09    0.35   0.23   0.35   0.24   0.46     0.59
## pos         0.09  1.00   -0.43  -0.29  -0.48  -0.37  -0.67    -0.86
## sg_putt     0.35 -0.43    1.00   0.03  -0.01  -0.03  -0.01     0.55
## sg_arg      0.23 -0.29    0.03   1.00   0.04  -0.01   0.46     0.40
## sg_app      0.35 -0.48   -0.01   0.04   1.00   0.11   0.75     0.61
## sg_ott      0.24 -0.37   -0.03  -0.01   0.11   1.00   0.57     0.45
## sg_t2g      0.46 -0.67   -0.01   0.46   0.75   0.57   1.00     0.82
## sg_total    0.59 -0.86    0.55   0.40   0.61   0.45   0.82     1.00
library(corrplot)
corrplot(res2, type = "lower", order = "hclust", 
         tl.col = "black", tl.srt = 45)

Strokes gained total and tee-to-green are sums of other strokes gained categories.

4b. Strokes gained predicting position
library(sjPlot)
pga$playerid<-as.factor(pga$`player id`)
pga$tourid<-as.factor(pga$`tournament id`)

m1<- lmer(pos ~ sg_ott+sg_app+sg_arg+sg_putt+(1|playerid), data=pga)
#summary(m1)
tab_model(m1)
  pos
Predictors Estimates CI p
(Intercept) 45.10 44.76 – 45.43 <0.001
sg ott -13.21 -13.50 – -12.93 <0.001
sg app -13.09 -13.28 – -12.89 <0.001
sg arg -13.41 -13.71 – -13.12 <0.001
sg putt -13.59 -13.78 – -13.39 <0.001
Random Effects
σ2 112.47
τ00 playerid 6.75
ICC 0.06
N playerid 426
Observations 17008
Marginal R2 / Conditional R2 0.731 / 0.746
#check_model(m1)
#model_performance(m1)

Unsurprisingly, as the total hole par increases, so do the number of strokes. Also unsurprisingly, players who made the cut had fewer stokes per round. Only players that made the cut received a final position ranking. For players that made the cut, each category of strokes gained was negatively related to final position indicating that higher strokes gained corresponded a lower position (better performance). Strokes gained putting and approaching the green were the most strongly related to worse performance.

Additionally, the strokes gained categories were uncorrelated with each other indicating that they are independent. For example, strokes gained putting was not related to strokes gained on the approach. So, if a golfer is struggling in one area, that may be independent from performance in other areas.

After accounting for clustering within player, all categories of of strokes gained statistically predicted final position in players that made the cut.

Strokes gained while putting is often what we hear about in golf coverage and what my high school golf coach always emphasized. However, among players that made tournament cuts, these results indicate that which category of strokes gained is the most impactful for overall performance may vary player to player.

Importantly, since only players that made the cut get a position, these relationships could change in the full sample. Let’s take a look at strokes gained predicting performance but with making the cut (yes/no) as the outcome.

table(pga$made_cut)
## 
##     0     1 
## 12726 17769
p1<-ggplot(pga, aes(x=made_cut, y=sg_ott, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Off the Tee", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p2<-ggplot(pga, aes(x=made_cut, y=sg_app, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Approach", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p3<-ggplot(pga, aes(x=made_cut, y=sg_arg, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Around the Green", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p4<-ggplot(pga, aes(x=made_cut, y=sg_putt, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Putting", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p5<-ggplot(pga, aes(x=made_cut, y=sg_t2g, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Tee-to-Green", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p6<-ggplot(pga, aes(x=made_cut, y=sg_total, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Total", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

plegend<-ggplot(pga, aes(x=made_cut, y=sg_total, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Total", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "bottom")

extract_legend <- function(my_ggp) {
  step1 <- ggplot_gtable(ggplot_build(my_ggp))
  step2 <- which(sapply(step1$grobs, function(x) x$name) == "guide-box")
  step3 <- step1$grobs[[step2]]
  return(step3)
}

shared_legend <- extract_legend(plegend)

library(gridExtra)
grid.arrange(arrangeGrob(p1, p2, p3, p4, p5, p6, ncol = 3),
             top = "Strokes Gained by Cut",
             shared_legend, nrow = 2, heights = c(10, 1))

quallogit <- glmer(made_cut ~ sg_ott+sg_app+sg_arg+sg_putt+(1|playerid), data = pga, family = "binomial", control = glmerControl(optimizer = "bobyqa"),
    nAGQ = 10)

print(quallogit, corr=FALSE)
## Generalized linear mixed model fit by maximum likelihood (Adaptive
##   Gauss-Hermite Quadrature, nAGQ = 10) [glmerMod]
##  Family: binomial  ( logit )
## Formula: made_cut ~ sg_ott + sg_app + sg_arg + sg_putt + (1 | playerid)
##    Data: pga
##       AIC       BIC    logLik  deviance  df.resid 
##  26873.48  26923.43 -13430.74  26861.48     30489 
## Random effects:
##  Groups   Name        Std.Dev.
##  playerid (Intercept) 0.2274  
## Number of obs: 30495, groups:  playerid, 484
## Fixed Effects:
## (Intercept)       sg_ott       sg_app       sg_arg      sg_putt  
##      0.7656       1.0290       1.0975       1.0835       1.1755
se <- sqrt(diag(vcov(quallogit)))
(tab <- cbind(Est = fixef(quallogit), LL = fixef(quallogit) - 1.96 * se, UL = fixef(quallogit) + 1.96 *
    se))
##                   Est        LL        UL
## (Intercept) 0.7655784 0.7248984 0.8062585
## sg_ott      1.0289975 0.9841851 1.0738100
## sg_app      1.0975417 1.0629155 1.1321679
## sg_arg      1.0835290 1.0366248 1.1304332
## sg_putt     1.1754735 1.1404946 1.2104523
tab_model(quallogit)
  made_cut
Predictors Odds Ratios CI p
(Intercept) 2.15 2.06 – 2.24 <0.001
sg ott 2.80 2.68 – 2.93 <0.001
sg app 3.00 2.89 – 3.10 <0.001
sg arg 2.96 2.82 – 3.10 <0.001
sg putt 3.24 3.13 – 3.35 <0.001
Random Effects
σ2 3.29
τ00 playerid 0.05
ICC 0.02
N playerid 484
Observations 30495
Marginal R2 / Conditional R2 0.589 / 0.595

Similar to the restricted sample, each category of strokes gained independently statistically predicted if player performance indexed by making the tournament cut. Higher strokes gained corresponded a better chance of making the tournament.

jpeg("spag.jpeg", units = "in", width = 12, res=200)
grid.arrange(spag1, spag2, spag3, spag4, spag5, spag6, nrow = 2, top = "Strokes Gained by Course over Time")
dev.off()
## quartz_off_screen 
##                 2