library(tidyverse)
library(data.table)
install.packages("ggcorrplot")
library(ggcorrplot)
# Importing our files
powerliftingdata <- fread('openpowerlifting.csv')
meetsdata <- fread('meets.csv')
> glimpse(powerliftingdata)
Observations: 386,414
Variables: 17
$ MeetID 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ Name "Angie Belk Terry", "Dawn Bogart", "Dawn Bogart", "Dawn Bogart", "Destiny Dula", "Courtney N...
$ Sex "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F...
$ Equipment "Wraps", "Single-ply", "Single-ply", "Raw", "Raw", "Wraps", "Raw", "Raw", "Wraps", "Raw", "W...
$ Age 47, 42, 42, 42, 18, 28, 60, 60, 52, 52, 24, 56, 56, 54, 17, 32, 25, 18, 22, 56, 30, 44, 34, ...
$ Division "Mst 45-49", "Mst 40-44", "Open Senior", "Open Senior", "Teen 18-19", "Open Senior", "Mst 60...
$ BodyweightKg 59.60, 58.51, 58.51, 58.51, 63.68, 62.41, 67.31, 67.31, 65.95, 65.95, 65.50, 71.21, 71.21, 7...
$ WeightClassKg "60", "60", "60", "60", "67.5", "67.5", "67.5", "67.5", "67.5", "67.5", "67.5", "75", "75", ...
$ Squat4Kg NA, NA, NA, NA, NA, -183.7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -183.7, ...
$ BestSquatKg 47.63, 142.88, 142.88, NA, NA, 170.10, 124.74, 124.74, 120.20, NA, 138.35, 120.20, 120.20, 1...
$ Bench4Kg NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ BestBenchKg 20.41, 95.25, 95.25, 95.25, 31.75, 77.11, 95.25, 95.25, 54.43, NA, 65.77, 43.09, 43.09, 74.8...
$ Deadlift4Kg NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 204.12, NA, ...
$ BestDeadliftKg 70.31, 163.29, 163.29, NA, 90.72, 145.15, 163.29, 163.29, 108.86, 108.86, 136.08, 129.27, 12...
$ TotalKg 138.35, 401.42, 401.42, 95.25, 122.47, 392.36, 383.28, 383.28, 283.49, 108.86, 340.20, 292.5...
$ Place "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1...
$ Wilks 155.05, 456.38, 456.38, 108.29, 130.47, 424.40, 391.98, 391.98, 294.25, 112.99, 354.89, 287....
> glimpse(meetsdata)
Observations: 8,482
Variables: 8
$ MeetID 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 2...
$ MeetPath "365strong/1601", "365strong/1602", "365strong/1603", "365strong/1604", "365strong/1605", "365s...
$ Federation "365Strong", "365Strong", "365Strong", "365Strong", "365Strong", "365Strong", "365Strong", "365...
$ Date "2016-10-29", "2016-11-19", "2016-07-09", "2016-06-11", "2016-04-10", "2017-04-22", "2017-01-21...
$ MeetCountry "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA...
$ MeetState "NC", "MO", "NC", "SC", "SC", "NC", "GA", "MO", "SC", "NC", "IN", "IN", "NC", "SC", "CA", "NV",...
$ MeetTown "Charlotte", "Ozark", "Charlotte", "Rock Hill", "Rock Hill", "Charlotte", "Martinez", "Lake Oza...
$ MeetName "2016 Junior & Senior National Powerlifting Championships", "Thanksgiving Powerlifting Classic"...
> powerliftingdata %>%
+ distinct(Name) %>%
+ count()
# A tibble: 1 x 1
n
1 136687
> powerliftingdata %>%
+ select(BodyweightKg) %>%
+ is.na() %>%
+ sum()
[1] 2402
> powerliftingdata %>%
+ select(Age) %>%
+ summarize(missingval = sum(is.na(Age)),
+ min = min(Age, na.rm = T),
+ max = max(Age, na.rm = T),
+ mean = mean(Age, na.rm = T))
missingval min max mean
1 239267 5 95 31.66824
> powerliftingdata %>%
+ select(Division) %>%
+ distinct() %>%
+ count()
# A tibble: 1 x 1
n
1 4247
> powerliftingdata %>%
+ filter( BodyweightKg <= 0 |
+ BestSquatKg <= 0 |
+ BestBenchKg <= 0 |
+ BestDeadliftKg <= 0) %>%
+ count()
# A tibble: 1 x 1
n
1 2919
#Removing unecessary columns, extreme ages and missing body weights
powerliftingdataclean <- powerliftingdata %>%
select(-Division, -Squat4Kg, -Bench4Kg, -Deadlift4Kg,
-TotalKg, -Wilks, -WeightClassKg, -Place) %>%
filter(Age >= 15 & Age <= 80) %>%
na.omit(cols='BodyweightKg')
# Also having gender as 'F' and 'M' makes analysis hard/worse, let's turn it into factors.
powerliftingdataclean$Sex <- powerliftingdataclean$Sex %>%
as.factor()
# Let's also turn Equipment to factors since we only have 4 uniques
powerliftingdataclean$Equipment <- powerliftingdataclean$Equipment %>%
as.factor()
# Also there seem to be about 1100 values that are negative (weights), let's remove those
powerliftingdataclean <- powerliftingdataclean %>%
filter( BodyweightKg >= 0 &
BestSquatKg >= 0 &
BestBenchKg >= 0 &
BestDeadliftKg >= 0)
# Let's order the Equipment so our graphs look nice, Raw -> Wraps -> Single-ply -> Multi-ply
powerliftingdataequipordered <- powerliftingdataclean
powerliftingdataequipordered$Equipment <- factor(powerliftingdataequipordered$Equipment,
levels = c('Raw','Wraps','Single-ply','Multi-ply'),
ordered = TRUE)
Firstname | Lastname | |
---|---|---|
John | Doe | john@example.com |
Mary | Moe | mary@example.com |
July | Dooley | july@example.com |
> test2 <- powerliftingdataclean %>%
+ group_by(Equipment) %>%
+ summarize(AvgAge = mean(Age),
+ Equipchoice = paste0(round(100*n()/105252,1), '%'),
+ AvgWeight = mean(BodyweightKg),
+ AvgTotal = mean(TotalKg),
+ AvgBench = mean(BestBenchKg),
+ AvgSquat = mean(BestSquatKg),
+ AvgDeadlift = mean(BestDeadliftKg))
> test2
# A tibble: 4 x 8
Equipment AvgAge Equipchoice AvgWeight AvgTotal AvgBench AvgSquat AvgDeadlift
1 Multi-ply 36.5 1.7% 98.6 711 188 281 242
2 Raw 29.3 65.5% 83.4 468 108 165 195
3 Single-ply 31.9 12.2% 84.5 572 143 216 214
4 Wraps 30.9 20.6% 92.0 548 129 201 218
> powerliftingdataclean %>%
+ group_by(Sex) %>%
+ summarize(AvgAge = mean(Age),
+ Count = length(Sex),
+ AvgWeight = mean(BodyweightKg),
+ AvgTotal = mean(TotalKg),
+ AvgBench = mean(BestBenchKg),
+ AvgSquat = mean(BestSquatKg),
+ AvgDeadlift = mean(BestDeadliftKg))
# A tibble: 2 x 8
Sex AvgAge Count AvgWeight AvgTotal AvgBench AvgSquat AvgDeadlift
1 F 30.9 33901 69.3 322 66.7 117 138
2 M 29.7 71351 93.3 586 143 211 233