PGA Tour Data Analysis Update

Analyzing and visualizing PGA tour data from 2015-2022

Data Import

Basic data cleaning and looking at descriptives

# data pulled from: https://www.advancedsportsanalytics.com/pga-overview

# look at var names and info
#names(pga)
#str(pga)
#head(pga)

# drop the empty variables
pga$`Unnamed: 2`<-NULL
pga$`Unnamed: 3`<-NULL
pga$`Unnamed: 4`<-NULL

# look for other NAs
pga[rowSums(is.na(pga))==0,]

## # A tibble: 17,008 × 34
##    Player_initial_last `tournament id` `player id` hole_par strokes hole_DKP
##    <chr>                         <dbl>       <dbl>    <dbl>   <dbl>    <dbl>
##  1 A. Ancer                  401025267        9261      280     276     59.5
##  2 A. Cook                   401025267        9517      280     270     76  
##  3 A. Hadwin                 401025267        5548      280     268     77  
##  4 A. Landry                 401025267        4682      280     272     66.5
##  5 A. Noren                  401025267        3832      280     269     74.5
##  6 A. Putnam                 401025267        5502      280     267     74.5
##  7 A. Scott                  401025267         388      280     276     69  
##  8 A. Wise                   401025267       10577      280     267     83.5
##  9 B. An                     401025267        5285      280     270     70  
## 10 B. DeChambeau             401025267       10046      280     268     77  
## # ℹ 16,998 more rows
## # ℹ 28 more variables: hole_FDP <dbl>, hole_SDP <dbl>, streak_DKP <dbl>,
## #   streak_FDP <dbl>, streak_SDP <dbl>, n_rounds <dbl>, made_cut <dbl>,
## #   pos <dbl>, finish_DKP <dbl>, finish_FDP <dbl>, finish_SDP <dbl>,
## #   total_DKP <dbl>, total_FDP <dbl>, total_SDP <dbl>, player <chr>,
## #   `tournament name` <chr>, course <chr>, date <chr>, purse <dbl>,
## #   season <dbl>, no_cut <dbl>, Finish <chr>, sg_putt <dbl>, sg_arg <dbl>, …

# subset to numeric vars and describe
numVARS<-c("hole_par",
          "strokes",
          "hole_DKP",
          "hole_FDP",
          "hole_SDP",
          "streak_DKP",
          "streak_FDP",
          "streak_SDP",
          "pos",
          "finish_DKP",
          "finish_FDP",
          "finish_SDP",
          "total_DKP",
          "total_FDP",
          "total_SDP",
          "sg_putt",
          "sg_arg",
          "sg_app",
          "sg_ott",
          "sg_t2g",
          "sg_total"
)
numITEMS<-pga[numVARS]

numITEMS %>% 
  summarise(across(everything(), ~ sum(is.na(.x))))

## # A tibble: 1 × 21
##   hole_par strokes hole_DKP hole_FDP hole_SDP streak_DKP streak_FDP streak_SDP
##      <int>   <int>    <int>    <int>    <int>      <int>      <int>      <int>
## 1        0       0        0        0        0          0          0          0
## # ℹ 13 more variables: pos <int>, finish_DKP <int>, finish_FDP <int>,
## #   finish_SDP <int>, total_DKP <int>, total_FDP <int>, total_SDP <int>,
## #   sg_putt <int>, sg_arg <int>, sg_app <int>, sg_ott <int>, sg_t2g <int>,
## #   sg_total <int>

pga<-pga %>%
  filter_at(vars(sg_putt, sg_arg, sg_app, sg_ott, sg_t2g, sg_total), all_vars(!is.na(.)))

numITEMS<-numITEMS %>%
  filter_at(vars(sg_putt, sg_arg, sg_app, sg_ott, sg_t2g, sg_total), all_vars(!is.na(.)))

kable(describe(numITEMS), 
      format='markdown', 
      digits=3)

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
hole_par	1	30495	222.691	70.548	280.00	225.553	11.861	70.00	292.00	222.00	-0.314	-1.794	0.404
strokes	2	30495	221.283	66.935	271.00	223.450	26.687	66.00	313.00	247.00	-0.307	-1.762	0.383
hole_DKP	3	30495	49.507	24.115	52.00	48.764	32.617	-2.50	174.00	176.50	0.255	-0.647	0.138
hole_FDP	4	30495	43.821	24.176	44.40	43.168	31.283	-21.40	134.70	156.10	0.185	-0.995	0.138
hole_SDP	5	30495	48.724	22.281	54.00	48.801	31.135	-11.00	107.00	118.00	-0.066	-1.412	0.128
streak_DKP	6	30495	1.724	2.824	0.00	1.112	0.000	0.00	23.00	23.00	2.089	5.242	0.016
streak_FDP	7	30495	7.550	7.214	6.40	6.612	8.303	0.00	43.60	43.60	0.937	0.276	0.041
streak_SDP	8	30495	1.633	2.629	0.00	1.058	0.000	0.00	22.00	22.00	1.867	3.988	0.015
pos	9	17008	34.141	21.398	32.00	33.428	26.687	1.00	91.00	90.00	0.225	-1.080	0.164
finish_DKP	10	30495	2.363	4.725	0.00	1.196	0.000	0.00	30.00	30.00	3.138	11.563	0.027
finish_FDP	11	30495	2.022	4.664	0.00	0.810	0.000	0.00	30.00	30.00	3.430	13.267	0.027
finish_SDP	12	30495	1.107	2.823	0.00	0.307	0.000	0.00	15.00	15.00	3.097	9.361	0.016
total_DKP	13	30495	53.593	28.903	54.00	51.666	37.065	-2.50	205.50	208.00	0.501	-0.342	0.166
total_FDP	14	30495	53.393	33.321	49.90	51.094	39.289	-21.40	202.60	224.00	0.525	-0.448	0.191
total_SDP	15	30495	51.464	25.325	55.00	50.654	35.582	-11.00	141.00	152.00	0.165	-1.103	0.145
sg_putt	16	30495	-0.120	1.118	-0.04	-0.075	1.038	-5.99	4.43	10.42	-0.449	0.808	0.006
sg_arg	17	30495	-0.040	0.725	0.00	-0.015	0.638	-6.43	3.17	9.60	-0.509	1.963	0.004
sg_app	18	30495	-0.100	1.116	0.00	-0.045	1.023	-9.25	4.67	13.92	-0.608	1.418	0.006
sg_ott	19	30495	-0.045	0.809	0.05	0.015	0.697	-7.74	2.77	10.51	-1.149	3.739	0.005
sg_t2g	20	30495	-0.184	1.636	-0.01	-0.080	1.468	-13.95	6.30	20.25	-0.814	1.784	0.009
sg_total	21	30495	-0.301	1.961	-0.16	-0.187	1.794	-13.67	8.52	22.19	-0.714	1.432	0.011

Course-based Research Questions:

Create course aggregates overall and by season

courseVARS<-c(
  "course",
          "sg_putt",
          "sg_arg",
          "sg_app",
          "sg_ott",
          "sg_t2g",
          "sg_total"
)
courseITEMS<-pga[courseVARS]

datwidenoyr <- courseITEMS %>% group_by(course) %>%   
  mutate(
    avg_sg_putt = mean(sg_putt,na.rm=T),
    avg_sg_arg = mean(sg_arg,na.rm=T),
    avg_sg_app = mean(sg_app,na.rm=T),
    avg_sg_ott = mean(sg_ott,na.rm=T),
    avg_sg_t2g = mean(sg_t2g,na.rm=T),
    avg_sg_total = mean(sg_total,na.rm=T))

aggVARS<-c(
  "course",
          "avg_sg_putt",
          "avg_sg_arg",
          "avg_sg_app",
          "avg_sg_ott",
          "avg_sg_t2g",
          "avg_sg_total"
)
datwidenoyrclean<-datwidenoyr[aggVARS]

datwidenoyrclean<-distinct(datwidenoyrclean)

#### by year 
pga$season<-as.factor(pga$season)

courseVARS<-c(
  "course",
  "season",
          "sg_putt",
          "sg_arg",
          "sg_app",
          "sg_ott",
          "sg_t2g",
          "sg_total"
)
courseITEMS<-pga[courseVARS]

datwideyr <- courseITEMS %>% group_by(course, season) %>%   
  mutate(
    avg_sg_putt = mean(sg_putt,na.rm=T),
    avg_sg_arg = mean(sg_arg,na.rm=T),
    avg_sg_app = mean(sg_app,na.rm=T),
    avg_sg_ott = mean(sg_ott,na.rm=T),
    avg_sg_t2g = mean(sg_t2g,na.rm=T),
    avg_sg_total = mean(sg_total,na.rm=T))

aggVARS<-c(
  "course",
  "season",
          "avg_sg_putt",
          "avg_sg_arg",
          "avg_sg_app",
          "avg_sg_ott",
          "avg_sg_t2g",
          "avg_sg_total"
)
datwideyrclean<-datwideyr[aggVARS]

datwideyrclean<-distinct(datwideyrclean)

1. How does strokes gained vary by course

The metric strokes gained was developed by Columbia business professor Mark Broadie and was a massive advancement in golf analytics: https://www8.gsb.columbia.edu/researcharchive/articles/4996. Strokes Gained measures the quality of each shot based on its starting and ending locations. So, calculations factor in the average number of shots for a golfer to get down from a given starting distance and starting lie condition and the average number of shots to get down from a given end distance and end lie condition. These average numbers are different for professional vs. amateur golfers. Negative numbers represent how many shots a golfer has lost compared to the benchmark, while positive numbers represent how many shots have been gained against said benchmark. Negative numbers highlight areas for improvement, while positive numbers indicate the golfer has performed well.

1. How do different strokes gained metrics vary course to course from 2015-2022?

1a. Which courses had the toughest greens on average from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_putt),]
head(datwidenoyrclean$course)

## [1] "Torrey Pines North - La Jolla, CA"     
## [2] "Sea Island Resort - Sea Island, GA"    
## [3] "Country Club of Jackson - Jackson, MS" 
## [4] "Glen Oaks Club - Old Westbury, NY"     
## [5] "Eagle Point Golf Club - Wilmington, NC"
## [6] "Memorial Park GC - Houston, TX"

ggplot(datwidenoyrclean, aes(avg_sg_putt, reorder(course, avg_sg_putt))) +
        geom_point() + xlab("Avg SG Putting") + ylab("Course") + theme_minimal()

1b. Which courses were toughest around the greens on average from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_arg),]
head(datwidenoyrclean$course)

## [1] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [2] "Winged Foot GC - Mamaroneck, NY"              
## [3] "Glen Oaks Club - Old Westbury, NY"            
## [4] "Congaree Golf Club - Ridgeland, SC"           
## [5] "Ridgewood Country Club - Paramus, NJ"         
## [6] "Muirfield Village Golf Club - Dublin, OH"

ggplot(datwidenoyrclean, aes(avg_sg_arg, reorder(course, avg_sg_arg))) +
        geom_point() + xlab("Avg SG ARG") + ylab("Course") + theme_minimal()

1c. Which courses were toughest on the approach on average from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_app),]
head(datwidenoyrclean$course)

## [1] "Congaree Golf Club - Ridgeland, SC"           
## [2] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [3] "Muirfield Village Golf Club - Dublin, OH"     
## [4] "TPC Sawgrass - Ponte Vedra Beach, FL"         
## [5] "PGA National - Palm Beach Gardens, FL"        
## [6] "TPC Twin Cities - Blaine, MN"

ggplot(datwidenoyrclean, aes(avg_sg_app, reorder(course, avg_sg_app))) +
        geom_point() + xlab("Avg SG App") + ylab("Course") + theme_minimal()

1d. Which courses were toughest off the tee on average from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_ott),]
head(datwidenoyrclean$course)

## [1] "The Concession Golf Club - Bradenton, FL"    
## [2] "Old White Course - White Sulphur Springs, WV"
## [3] "Golf Club of Houston - Houston, TX"          
## [4] "Liberty National - Jersey City, NJ"          
## [5] "TPC Sawgrass - Ponte Vedra Beach, FL"        
## [6] "Congaree Golf Club - Ridgeland, SC"

ggplot(datwidenoyrclean, aes(avg_sg_ott, reorder(course, avg_sg_ott))) +
        geom_point() + xlab("Avg SG OTT") + ylab("Course") + theme_minimal()

1e. Which courses were toughest as measured by total strokes gained from 2015-2022?

datwidenoyrclean <- datwidenoyrclean[order(datwidenoyrclean$avg_sg_total),]
head(datwidenoyrclean$course)

## [1] "Congaree Golf Club - Ridgeland, SC"           
## [2] "TPC Sawgrass - Ponte Vedra Beach, FL"         
## [3] "Torrey Pines North - La Jolla, CA"            
## [4] "Kiawah Island Golf Resort - Kiawah Island, SC"
## [5] "Muirfield Village Golf Club - Dublin, OH"     
## [6] "Glen Oaks Club - Old Westbury, NY"

ggplot(datwidenoyrclean, aes(avg_sg_total, reorder(course, avg_sg_total))) +
        geom_point() + xlab("Avg SG Total") + ylab("Course") + theme_minimal()

2. How have strokes gained fluctuated by course over time?

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_putt, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Putt")
spag1 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_arg, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Around the Green")
spag2 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_app, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Approach")
spag3 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_ott, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Off the Tee")
spag4 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_t2g, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Tee to Green")
spag5 = tspag + aes(colour = factor(course)) + theme_bw()

tspag = ggplot(datwideyrclean, aes(x=season, y=avg_sg_total, group = course)) + 
  geom_line() + guides(colour=FALSE) + xlab("Season") +
  ylab("Avg SG Total")
spag6 = tspag + aes(colour = factor(course)) + theme_bw()


library(gridExtra)
grid.arrange(spag1, spag2, spag3, spag4, spag5, spag6, nrow = 2,
             top = "Strokes Gained by Course over Time"
)

Player-based Research Questions:

1. Who appeared in the most tournaments since 2015?

1a. What is the average number of tournament appearances?

# count players' appearances
app<-as.data.frame(table(pga$Player_initial_last)) 
app<-app[ order(-app$Freq), ]
head(app)

##           Var1 Freq
## 70   B. Stuard  182
## 432    T. Hoge  175
## 126     D. Lee  174
## 320  N. Taylor  173
## 51   B. Harman  171
## 440 T. Merritt  169

kable(describe(app$Freq), format='markdown', 
      digits=3)

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
X1	1	482	63.268	50.865	56.5	59.759	65.976	1	182	181	0.384	-1.114	2.317

ggplot(app, aes(x=Freq))+
  geom_histogram(color="#FFFFFF", fill="#003C80")+
  scale_x_continuous(breaks = seq(0, 200, by = 10))+
  scale_y_continuous(breaks = seq(0, 90, len = 10))+
  labs(title="Player Tournament Appearances (2015-2022)",x="Frequency", y = "Count")+
  theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Brian Stuard appeared in the most PGA tournaments from 2015-2022. The average number of tournaments played in was 63. Many players performed in fewer than 20 tournaments.

2. Who won the most tournaments since 2015?

winners <- pga[ which(pga$pos==1), ]

wincount<-winners %>% count(Player_initial_last)
wincount<-wincount[ order(-wincount$n), ]

ggplot(winners, aes(x = fct_infreq(Player_initial_last)))+
  geom_bar(color="#FFFFFF", fill="#CCA600")+
  labs(title="Player Tournament Wins (2015-2022)",x="Player", y = "Count")+
  theme_minimal()+
  theme(legend.position = "bottom", axis.text.x = element_text(angle=60, size=7, hjust = 1))

Dustin Johnson and Justin Thomas had the most tournament wins in this time period.

2a. How have wins changed over time?

library(gghighlight)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

winby<-as.data.frame(table(winners$Player_initial_last, winners$season))

winby <- winby %>% 
       rename("player" = "Var1",
              "season" = "Var2",
              "wins" = "Freq")

p2 <- winby %>% 
  ggplot(aes(x = season, y = wins, group = player)) + 
  labs(x= "Season", y= "Wins") + 
  ggtitle("Tournament Wins Over Time") +
  geom_line() + 
  gghighlight(player == "R. McIlroy")+
  theme_minimal()

## label_key: player

p2

3. Examining max streak DKP, FDP and SDP (fantasy golf / sports betting data)

streakVARS<-c(
           "streak_DKP",
           "streak_FDP",
           "streak_SDP"
)
streakITEMS<-pga[streakVARS]
kable(describe(streakITEMS), format='markdown', 
      digits=3)

	vars	n	mean	sd	median	trimmed	mad	max	range	skew	kurtosis	se
streak_DKP	1	30495	1.724	2.824	0.0	1.112	0.000	23.0	23.0	2.089	5.242	0.016
streak_FDP	2	30495	7.550	7.214	6.4	6.612	8.303	43.6	43.6	0.937	0.276	0.041
streak_SDP	3	30495	1.633	2.629	0.0	1.058	0.000	22.0	22.0	1.867	3.988	0.015

4. Information on strokes and strokes gained

## plot by cut
## scale strokes by round
pga$strokesperround<-(pga$strokes/pga$n_rounds)
pga$made_cut<-as.factor(pga$made_cut)

ggplot(pga, aes(x=made_cut, y=strokesperround, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(title="Stokes per Round by Cut", y = "Strokes per Round", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position="bottom")

## strokes and par 
ggplot(pga, aes(x=hole_par, y=strokes)) + 
  geom_point()+
  geom_smooth(method=lm)+
  labs(y = "Total Strokes", x = "Total Hole Par")+
  theme_bw()

## `geom_smooth()` using formula = 'y ~ x'

## correlation matrix of numeric items
res <- cor(numITEMS, use="pairwise.complete.obs")
library(corrplot)

## corrplot 0.92 loaded

corrplot(res, type = "lower", order = "hclust", 
         tl.col = "black", tl.srt = 45)

# distributions of strokes gained
p1<-ggplot(pga, aes(sg_putt))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="Putt")+
  theme_bw()

p2<-ggplot(pga, aes(sg_arg))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="ARG")+
  theme_bw()

p3<-ggplot(pga, aes(sg_app))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="App")+
  theme_bw()

p4<-ggplot(pga, aes(sg_ott))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="OTT")+
  theme_bw()

p5<-ggplot(pga, aes(sg_t2g))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="t2g")+
  theme_bw()

p6<-ggplot(pga, aes(sg_total))+
  geom_histogram(color="darkgreen", fill="lightgreen")+
  labs(x="Total")+
  theme_bw()

library(gridExtra)
grid.arrange(p1, p2, p3, p4, p5, p6, nrow = 2,
             top = "Strokes Gained"
)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

sgVARS<-c(
           "strokes",
           "pos",
           "sg_putt",
           "sg_arg",
           "sg_app",
           "sg_ott",
           "sg_t2g",
           "sg_total"

)
sgITEMS<-pga[sgVARS]
kable(describe(sgITEMS), format='markdown', 
      digits=3)

	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
strokes	1	30495	221.283	66.935	271.00	223.450	26.687	66.00	313.00	247.00	-0.307	-1.762	0.383
pos	2	17008	34.141	21.398	32.00	33.428	26.687	1.00	91.00	90.00	0.225	-1.080	0.164
sg_putt	3	30495	-0.120	1.118	-0.04	-0.075	1.038	-5.99	4.43	10.42	-0.449	0.808	0.006
sg_arg	4	30495	-0.040	0.725	0.00	-0.015	0.638	-6.43	3.17	9.60	-0.509	1.963	0.004
sg_app	5	30495	-0.100	1.116	0.00	-0.045	1.023	-9.25	4.67	13.92	-0.608	1.418	0.006
sg_ott	6	30495	-0.045	0.809	0.05	0.015	0.697	-7.74	2.77	10.51	-1.149	3.739	0.005
sg_t2g	7	30495	-0.184	1.636	-0.01	-0.080	1.468	-13.95	6.30	20.25	-0.814	1.784	0.009
sg_total	8	30495	-0.301	1.961	-0.16	-0.187	1.794	-13.67	8.52	22.19	-0.714	1.432	0.011

res2 <- cor(sgITEMS, use="pairwise.complete.obs")
round(res2, 2)

##          strokes   pos sg_putt sg_arg sg_app sg_ott sg_t2g sg_total
## strokes     1.00  0.09    0.35   0.23   0.35   0.24   0.46     0.59
## pos         0.09  1.00   -0.43  -0.29  -0.48  -0.37  -0.67    -0.86
## sg_putt     0.35 -0.43    1.00   0.03  -0.01  -0.03  -0.01     0.55
## sg_arg      0.23 -0.29    0.03   1.00   0.04  -0.01   0.46     0.40
## sg_app      0.35 -0.48   -0.01   0.04   1.00   0.11   0.75     0.61
## sg_ott      0.24 -0.37   -0.03  -0.01   0.11   1.00   0.57     0.45
## sg_t2g      0.46 -0.67   -0.01   0.46   0.75   0.57   1.00     0.82
## sg_total    0.59 -0.86    0.55   0.40   0.61   0.45   0.82     1.00

library(corrplot)
corrplot(res2, type = "lower", order = "hclust", 
         tl.col = "black", tl.srt = 45)

Strokes gained total and tee-to-green are sums of other strokes gained categories.

4b. Strokes gained predicting position

library(sjPlot)
pga$playerid<-as.factor(pga$`player id`)
pga$tourid<-as.factor(pga$`tournament id`)

m1<- lmer(pos ~ sg_ott+sg_app+sg_arg+sg_putt+(1|playerid), data=pga)
#summary(m1)
tab_model(m1)

	pos
Predictors	Estimates	CI	p
(Intercept)	45.10	44.76 – 45.43	<0.001
sg ott	-13.21	-13.50 – -12.93	<0.001
sg app	-13.09	-13.28 – -12.89	<0.001
sg arg	-13.41	-13.71 – -13.12	<0.001
sg putt	-13.59	-13.78 – -13.39	<0.001
Random Effects
σ²	112.47
τ₀₀ _playerid	6.75
ICC	0.06
N _playerid	426
Observations	17008
Marginal R² / Conditional R²	0.731 / 0.746

#check_model(m1)
#model_performance(m1)

Unsurprisingly, as the total hole par increases, so do the number of strokes. Also unsurprisingly, players who made the cut had fewer stokes per round. Only players that made the cut received a final position ranking. For players that made the cut, each category of strokes gained was negatively related to final position indicating that higher strokes gained corresponded a lower position (better performance). Strokes gained putting and approaching the green were the most strongly related to worse performance.

Additionally, the strokes gained categories were uncorrelated with each other indicating that they are independent. For example, strokes gained putting was not related to strokes gained on the approach. So, if a golfer is struggling in one area, that may be independent from performance in other areas.

After accounting for clustering within player, all categories of of strokes gained statistically predicted final position in players that made the cut.

Strokes gained while putting is often what we hear about in golf coverage and what my high school golf coach always emphasized. However, among players that made tournament cuts, these results indicate that which category of strokes gained is the most impactful for overall performance may vary player to player.

Importantly, since only players that made the cut get a position, these relationships could change in the full sample. Let’s take a look at strokes gained predicting performance but with making the cut (yes/no) as the outcome.

table(pga$made_cut)

## 
##     0     1 
## 12726 17769

p1<-ggplot(pga, aes(x=made_cut, y=sg_ott, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Off the Tee", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p2<-ggplot(pga, aes(x=made_cut, y=sg_app, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Approach", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p3<-ggplot(pga, aes(x=made_cut, y=sg_arg, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Around the Green", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p4<-ggplot(pga, aes(x=made_cut, y=sg_putt, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Putting", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p5<-ggplot(pga, aes(x=made_cut, y=sg_t2g, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Tee-to-Green", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

p6<-ggplot(pga, aes(x=made_cut, y=sg_total, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Total", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "none")

plegend<-ggplot(pga, aes(x=made_cut, y=sg_total, fill=made_cut)) + 
  geom_boxplot()+
  theme_bw()+
  labs(y = "Stokes Gained Total", x = "")+
  scale_fill_discrete(name = "Made Cut", labels = c("No", "Yes"))+
  theme(legend.position = "bottom")

extract_legend <- function(my_ggp) {
  step1 <- ggplot_gtable(ggplot_build(my_ggp))
  step2 <- which(sapply(step1$grobs, function(x) x$name) == "guide-box")
  step3 <- step1$grobs[[step2]]
  return(step3)
}

shared_legend <- extract_legend(plegend)

library(gridExtra)
grid.arrange(arrangeGrob(p1, p2, p3, p4, p5, p6, ncol = 3),
             top = "Strokes Gained by Cut",
             shared_legend, nrow = 2, heights = c(10, 1))

quallogit <- glmer(made_cut ~ sg_ott+sg_app+sg_arg+sg_putt+(1|playerid), data = pga, family = "binomial", control = glmerControl(optimizer = "bobyqa"),
    nAGQ = 10)

print(quallogit, corr=FALSE)

## Generalized linear mixed model fit by maximum likelihood (Adaptive
##   Gauss-Hermite Quadrature, nAGQ = 10) [glmerMod]
##  Family: binomial  ( logit )
## Formula: made_cut ~ sg_ott + sg_app + sg_arg + sg_putt + (1 | playerid)
##    Data: pga
##       AIC       BIC    logLik  deviance  df.resid 
##  26873.48  26923.43 -13430.74  26861.48     30489 
## Random effects:
##  Groups   Name        Std.Dev.
##  playerid (Intercept) 0.2274  
## Number of obs: 30495, groups:  playerid, 484
## Fixed Effects:
## (Intercept)       sg_ott       sg_app       sg_arg      sg_putt  
##      0.7656       1.0290       1.0975       1.0835       1.1755

se <- sqrt(diag(vcov(quallogit)))
(tab <- cbind(Est = fixef(quallogit), LL = fixef(quallogit) - 1.96 * se, UL = fixef(quallogit) + 1.96 *
    se))

##                   Est        LL        UL
## (Intercept) 0.7655784 0.7248984 0.8062585
## sg_ott      1.0289975 0.9841851 1.0738100
## sg_app      1.0975417 1.0629155 1.1321679
## sg_arg      1.0835290 1.0366248 1.1304332
## sg_putt     1.1754735 1.1404946 1.2104523

tab_model(quallogit)

	made_cut
Predictors	Odds Ratios	CI	p
(Intercept)	2.15	2.06 – 2.24	<0.001
sg ott	2.80	2.68 – 2.93	<0.001
sg app	3.00	2.89 – 3.10	<0.001
sg arg	2.96	2.82 – 3.10	<0.001
sg putt	3.24	3.13 – 3.35	<0.001
Random Effects
σ²	3.29
τ₀₀ _playerid	0.05
ICC	0.02
N _playerid	484
Observations	30495
Marginal R² / Conditional R²	0.589 / 0.595

Similar to the restricted sample, each category of strokes gained independently statistically predicted if player performance indexed by making the tournament cut. Higher strokes gained corresponded a better chance of making the tournament.

jpeg("spag.jpeg", units = "in", width = 12, res=200)
grid.arrange(spag1, spag2, spag3, spag4, spag5, spag6, nrow = 2, top = "Strokes Gained by Course over Time")
dev.off()

## quartz_off_screen 
##                 2