class: center, middle, inverse, title-slide .title[ # Exploratory data analysis ] .author[ ### MACS 30500
University of Chicago ] --- # Exploratory data analysis 1. Generate questions about your data 1. Search for answers by visualizing, transforming, and modeling your data 1. Use what you learn to refine your questions and/or generate new questions 1. Rinse and repeat until you publish a paper -- * Variation * Covariation --- count: false ## Characteristics of Exploratory Data Analysis .panel1-penguins-eda-auto[ ```r *ggplot( * data = penguins, * mapping = aes( * x = body_mass_g, * y = flipper_length_mm * ) *) ``` ] .panel2-penguins-eda-auto[ <img src="index_files/figure-html/penguins-eda_auto_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Characteristics of Exploratory Data Analysis .panel1-penguins-eda-auto[ ```r ggplot( data = penguins, mapping = aes( x = body_mass_g, y = flipper_length_mm ) ) + * geom_point() ``` ] .panel2-penguins-eda-auto[ <img src="index_files/figure-html/penguins-eda_auto_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Characteristics of Exploratory Data Analysis .panel1-penguins-eda-auto[ ```r ggplot( data = penguins, mapping = aes( x = body_mass_g, y = flipper_length_mm ) ) + geom_point() + * geom_smooth() ``` ] .panel2-penguins-eda-auto[ <img src="index_files/figure-html/penguins-eda_auto_03_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-penguins-eda-auto { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-penguins-eda-auto { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel3-penguins-eda-auto { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- count: false ## Characteristics of Confirmatory Data Analysis .panel1-penguins-final-auto[ ```r *ggplot( * data = penguins, * mapping = aes( * x = body_mass_g, * y = flipper_length_mm * ) *) ``` ] 
.panel2-penguins-final-auto[ <img src="index_files/figure-html/penguins-final_auto_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Characteristics of Confirmatory Data Analysis .panel1-penguins-final-auto[ ```r ggplot( data = penguins, mapping = aes( x = body_mass_g, y = flipper_length_mm ) ) + * geom_point(alpha = .1) ``` ] .panel2-penguins-final-auto[ <img src="index_files/figure-html/penguins-final_auto_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Characteristics of Confirmatory Data Analysis .panel1-penguins-final-auto[ ```r ggplot( data = penguins, mapping = aes( x = body_mass_g, y = flipper_length_mm ) ) + geom_point(alpha = .1) + * geom_smooth(se = FALSE) ``` ] .panel2-penguins-final-auto[ <img src="index_files/figure-html/penguins-final_auto_03_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Characteristics of Confirmatory Data Analysis .panel1-penguins-final-auto[ ```r ggplot( data = penguins, mapping = aes( x = body_mass_g, y = flipper_length_mm ) ) + geom_point(alpha = .1) + geom_smooth(se = FALSE) + * labs( * title = "Relationship between body mass and\nflipper length of a penguin", * subtitle = "Sample of 344 penguins", * x = "Body mass(g)", * y = "Flipper length(mm)" * ) ``` ] .panel2-penguins-final-auto[ <img src="index_files/figure-html/penguins-final_auto_04_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Characteristics of Confirmatory Data Analysis .panel1-penguins-final-auto[ ```r ggplot( data = penguins, mapping = aes( x = body_mass_g, y = flipper_length_mm ) ) + geom_point(alpha = .1) + geom_smooth(se = FALSE) + labs( title = "Relationship between body mass and\nflipper length of a penguin", subtitle = "Sample of 344 penguins", x = "Body mass(g)", y = "Flipper length(mm)" ) + * theme_xaringan( * title_font_size = 18, * text_font_size = 16 * ) ``` ] 
.panel2-penguins-final-auto[ <img src="index_files/figure-html/penguins-final_auto_05_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-penguins-final-auto { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-penguins-final-auto { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel3-penguins-final-auto { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- class: inverse, middle # `scorecard` --- ## `scorecard` ```r glimpse(scorecard) ``` ``` ## Rows: 1,732 ## Columns: 14 ## $ unitid <dbl> 100654, 100663, 100706, 100724, 100751, 100830, 100858, 1009… ## $ name <chr> "Alabama A & M University", "University of Alabama at Birmin… ## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", … ## $ type <fct> "Public", "Public", "Public", "Public", "Public", "Public", … ## $ admrate <dbl> 0.9175, 0.7366, 0.8257, 0.9690, 0.8268, 0.9044, 0.8067, 0.53… ## $ satavg <dbl> 939, 1234, 1319, 946, 1261, 1082, 1300, 1230, 1066, NA, 1076… ## $ cost <dbl> 23053, 24495, 23917, 21866, 29872, 19849, 31590, 32095, 3431… ## $ netcost <dbl> 14990, 16953, 15860, 13650, 22597, 13987, 24104, 22107, 2071… ## $ avgfacsal <dbl> 69381, 99441, 87192, 64989, 92619, 71343, 96642, 56646, 5400… ## $ pctpell <dbl> 0.7019, 0.3512, 0.2536, 0.7627, 0.1772, 0.4644, 0.1455, 0.23… ## $ comprate <dbl> 0.2974, 0.6340, 0.5768, 0.3276, 0.7110, 0.3401, 0.7911, 0.69… ## $ firstgen <dbl> 0.3658281, 0.3412237, 0.3101322, 0.3434343, 0.2257127, 0.381… ## $ debt <dbl> 15250, 15085, 14000, 17500, 17671, 12000, 17500, 16000, 1425… ## $ locale <fct> City, City, City, City, City, City, City, City, City, Suburb… ``` --- class: inverse, middle # Univariate graphs --- count: false ## Histogram .panel1-histogram-auto[ ```r *ggplot( * data = scorecard, * mapping = aes(x = cost) *) ``` ] .panel2-histogram-auto[ <img 
src="index_files/figure-html/histogram_auto_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Histogram .panel1-histogram-auto[ ```r ggplot( data = scorecard, mapping = aes(x = cost) ) + * geom_histogram() ``` ] .panel2-histogram-auto[ <img src="index_files/figure-html/histogram_auto_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-histogram-auto { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-histogram-auto { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel3-histogram-auto { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- count: false ## Histogram .panel1-histogram-bins-rotate[ ```r ggplot( data = scorecard, mapping = aes(x = cost) ) + * geom_histogram(bins = 50) ``` ] .panel2-histogram-bins-rotate[ <img src="index_files/figure-html/histogram-bins_rotate_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Histogram .panel1-histogram-bins-rotate[ ```r ggplot( data = scorecard, mapping = aes(x = cost) ) + * geom_histogram(bins = 30) ``` ] .panel2-histogram-bins-rotate[ <img src="index_files/figure-html/histogram-bins_rotate_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Histogram .panel1-histogram-bins-rotate[ ```r ggplot( data = scorecard, mapping = aes(x = cost) ) + * geom_histogram(bins = 10) ``` ] .panel2-histogram-bins-rotate[ <img src="index_files/figure-html/histogram-bins_rotate_03_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-histogram-bins-rotate { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-histogram-bins-rotate { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } 
.panel3-histogram-bins-rotate { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- count: false ## Bar chart .panel1-barplot-auto[ ```r *ggplot( * data = scorecard, * mapping = aes(x = type) *) ``` ] .panel2-barplot-auto[ <img src="index_files/figure-html/barplot_auto_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Bar chart .panel1-barplot-auto[ ```r ggplot( data = scorecard, mapping = aes(x = type) ) + * geom_bar() ``` ] .panel2-barplot-auto[ <img src="index_files/figure-html/barplot_auto_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-barplot-auto { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-barplot-auto { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel3-barplot-auto { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- ## Covariation 1. Two-dimensional graphs 1. Multiple window plots 1. 
Utilizing additional channels --- count: false ## Box plot .panel1-boxplot-auto[ ```r *ggplot( * data = scorecard, * mapping = aes( * x = type, * y = cost * ) *) ``` ] .panel2-boxplot-auto[ <img src="index_files/figure-html/boxplot_auto_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Box plot .panel1-boxplot-auto[ ```r ggplot( data = scorecard, mapping = aes( x = type, y = cost ) ) + * geom_boxplot() ``` ] .panel2-boxplot-auto[ <img src="index_files/figure-html/boxplot_auto_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-boxplot-auto { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-boxplot-auto { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel3-boxplot-auto { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- count: false ## Scatterplot .panel1-scatterplot-auto[ ```r *ggplot( * data = scorecard, * mapping = aes( * x = cost, * y = netcost * ) *) ``` ] .panel2-scatterplot-auto[ <img src="index_files/figure-html/scatterplot_auto_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Scatterplot .panel1-scatterplot-auto[ ```r ggplot( data = scorecard, mapping = aes( x = cost, y = netcost ) ) + * geom_point() ``` ] .panel2-scatterplot-auto[ <img src="index_files/figure-html/scatterplot_auto_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-scatterplot-auto { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-scatterplot-auto { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel3-scatterplot-auto { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- count: false ## Multiple window plots 
.panel1-histogram-facet-user[ ```r *ggplot( * data = scorecard, * mapping = aes(x = cost) *) + * geom_histogram() ``` ] .panel2-histogram-facet-user[ <img src="index_files/figure-html/histogram-facet_user_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Multiple window plots .panel1-histogram-facet-user[ ```r ggplot( data = scorecard, mapping = aes(x = cost) ) + geom_histogram() + * facet_wrap(facets = vars(type)) ``` ] .panel2-histogram-facet-user[ <img src="index_files/figure-html/histogram-facet_user_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-histogram-facet-user { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-histogram-facet-user { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel3-histogram-facet-user { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- count: false ## Multiple window plots .panel1-scatterplot-facet-user[ ```r *ggplot( * data = scorecard, * mapping = aes( * x = cost, * y = netcost * ) *) + * geom_point() ``` ] .panel2-scatterplot-facet-user[ <img src="index_files/figure-html/scatterplot-facet_user_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Multiple window plots .panel1-scatterplot-facet-user[ ```r ggplot( data = scorecard, mapping = aes( x = cost, y = netcost ) ) + geom_point() + * facet_wrap(facets = vars(type)) ``` ] .panel2-scatterplot-facet-user[ <img src="index_files/figure-html/scatterplot-facet_user_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-scatterplot-facet-user { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } .panel2-scatterplot-facet-user { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 80% } 
.panel3-scatterplot-facet-user { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 80% } </style> --- count: false ## Utilizing additional channels .panel1-scatterplot-mult-channels-rotate[ ```r ggplot( data = scorecard, mapping = aes( x = cost, y = netcost, * color = type, ) ) + geom_point() ``` ] .panel2-scatterplot-mult-channels-rotate[ <img src="index_files/figure-html/scatterplot-mult-channels_rotate_01_output-1.png" width="80%" style="display: block; margin: auto;" /> ] --- count: false ## Utilizing additional channels .panel1-scatterplot-mult-channels-rotate[ ```r ggplot( data = scorecard, mapping = aes( x = cost, y = netcost, * color = type, size = debt ) ) + geom_point() ``` ] .panel2-scatterplot-mult-channels-rotate[ <img src="index_files/figure-html/scatterplot-mult-channels_rotate_02_output-1.png" width="80%" style="display: block; margin: auto;" /> ] <style> .panel1-scatterplot-mult-channels-rotate { color: black; width: 38.6060606060606%; hight: 32%; float: left; padding-left: 1%; font-size: 60% } .panel2-scatterplot-mult-channels-rotate { color: black; width: 59.3939393939394%; hight: 32%; float: left; padding-left: 1%; font-size: 60% } .panel3-scatterplot-mult-channels-rotate { color: black; width: NA%; hight: 33%; float: left; padding-left: 1%; font-size: 60% } </style>