Joining data

Code for Quiz 6, more dplyr and our first interactive chart

Step 1-6

  1. Load the R packages we will use.
  1. Read the data in the files, drug_cos.csv, health_cos.csv in to R and assign to the variables drug_cos and health_cos, respectively
drug_cos <- read_csv("https://estanny.com/static/week6/drug_cos.csv")
health_cos <- read_csv("https://estanny.com/static/week6/health_cos.csv")
  1. Use glimse to get a glimpse of the data
drug_cos %>% glimpse()
Rows: 104
Columns: 9
$ ticker       <chr> "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS"~
$ name         <chr> "Zoetis Inc", "Zoetis Inc", "Zoetis Inc", "Zoet~
$ location     <chr> "New Jersey; U.S.A", "New Jersey; U.S.A", "New ~
$ ebitdamargin <dbl> 0.149, 0.217, 0.222, 0.238, 0.182, 0.335, 0.366~
$ grossmargin  <dbl> 0.610, 0.640, 0.634, 0.641, 0.635, 0.659, 0.666~
$ netmargin    <dbl> 0.058, 0.101, 0.111, 0.122, 0.071, 0.168, 0.163~
$ ros          <dbl> 0.101, 0.171, 0.176, 0.195, 0.140, 0.286, 0.321~
$ roe          <dbl> 0.069, 0.113, 0.612, 0.465, 0.285, 0.587, 0.488~
$ year         <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,~
health_cos %>% glimpse()
Rows: 464
Columns: 11
$ ticker      <chr> "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS",~
$ name        <chr> "Zoetis Inc", "Zoetis Inc", "Zoetis Inc", "Zoeti~
$ revenue     <dbl> 4233000000, 4336000000, 4561000000, 4785000000, ~
$ gp          <dbl> 2581000000, 2773000000, 2892000000, 3068000000, ~
$ rnd         <dbl> 427000000, 409000000, 399000000, 396000000, 3640~
$ netincome   <dbl> 245000000, 436000000, 504000000, 583000000, 3390~
$ assets      <dbl> 5711000000, 6262000000, 6558000000, 6588000000, ~
$ liabilities <dbl> 1975000000, 2221000000, 5596000000, 5251000000, ~
$ marketcap   <dbl> NA, NA, 16345223371, 21572007994, 23860348635, 2~
$ year        <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, ~
$ industry    <chr> "Drug Manufacturers - Specialty & Generic", "Dru~
  1. Which variables are the same in both data sets
names_drug <- drug_cos %>% names()
names_health <- health_cos %>% names()
intersect(names_drug, names_health)
[1] "ticker" "name"   "year"  
  1. Select subset of variables to work with

For drug_cos select (in this order): ticker, year, grossmargin

For health_cos select (in this order): ticker, year, revenue, gp, industry

drug_subset <- drug_cos %>%
  select(ticker,year, grossmargin) %>%
  filter(year == 2018)
health_subset <- health_cos %>%
  select(ticker, year, revenue, gp, industry) %>%
  filter(year == 2018)
  1. Keep all the rows and columns drug_subset join with columns in health_subset
drug_subset %>% left_join(health_subset)
# A tibble: 13 x 6
   ticker  year grossmargin     revenue          gp industry          
   <chr>  <dbl>       <dbl>       <dbl>       <dbl> <chr>             
 1 ZTS     2018       0.672  5825000000  3914000000 Drug Manufacturer~
 2 PRGO    2018       0.387  4731700000  1831500000 Drug Manufacturer~
 3 PFE     2018       0.79  53647000000 42399000000 Drug Manufacturer~
 4 MYL     2018       0.35  11433900000  4001600000 Drug Manufacturer~
 5 MRK     2018       0.681 42294000000 28785000000 Drug Manufacturer~
 6 LLY     2018       0.738 24555700000 18125700000 Drug Manufacturer~
 7 JNJ     2018       0.668 81581000000 54490000000 Drug Manufacturer~
 8 GILD    2018       0.781 22127000000 17274000000 Drug Manufacturer~
 9 BMY     2018       0.71  22561000000 16014000000 Drug Manufacturer~
10 BIIB    2018       0.865 13452900000 11636600000 Drug Manufacturer~
11 AMGN    2018       0.827 23747000000 19646000000 Drug Manufacturer~
12 AGN     2018       0.861 15787400000 13596000000 Drug Manufacturer~
13 ABBV    2018       0.764 32753000000 25035000000 Drug Manufacturer~

Question: join_ticker

drug_cos_subset <- drug_cos %>% 
  filter(ticker == "MRK")
drug_cos_subset
# A tibble: 8 x 9
  ticker name  location ebitdamargin grossmargin netmargin   ros   roe
  <chr>  <chr> <chr>           <dbl>       <dbl>     <dbl> <dbl> <dbl>
1 MRK    Merc~ New Jer~        0.305       0.649     0.131 0.15  0.114
2 MRK    Merc~ New Jer~        0.33        0.652     0.13  0.182 0.113
3 MRK    Merc~ New Jer~        0.282       0.615     0.1   0.123 0.089
4 MRK    Merc~ New Jer~        0.567       0.603     0.282 0.409 0.248
5 MRK    Merc~ New Jer~        0.298       0.622     0.112 0.136 0.096
6 MRK    Merc~ New Jer~        0.254       0.648     0.098 0.117 0.092
7 MRK    Merc~ New Jer~        0.278       0.678     0.06  0.162 0.063
8 MRK    Merc~ New Jer~        0.313       0.681     0.147 0.206 0.199
# ... with 1 more variable: year <dbl>

Question: summarize_industry

combo_df <- drug_cos_subset %>%
  left_join(health_cos)
combo_df
# A tibble: 8 x 17
  ticker name  location ebitdamargin grossmargin netmargin   ros   roe
  <chr>  <chr> <chr>           <dbl>       <dbl>     <dbl> <dbl> <dbl>
1 MRK    Merc~ New Jer~        0.305       0.649     0.131 0.15  0.114
2 MRK    Merc~ New Jer~        0.33        0.652     0.13  0.182 0.113
3 MRK    Merc~ New Jer~        0.282       0.615     0.1   0.123 0.089
4 MRK    Merc~ New Jer~        0.567       0.603     0.282 0.409 0.248
5 MRK    Merc~ New Jer~        0.298       0.622     0.112 0.136 0.096
6 MRK    Merc~ New Jer~        0.254       0.648     0.098 0.117 0.092
7 MRK    Merc~ New Jer~        0.278       0.678     0.06  0.162 0.063
8 MRK    Merc~ New Jer~        0.313       0.681     0.147 0.206 0.199
# ... with 9 more variables: year <dbl>, revenue <dbl>, gp <dbl>,
#   rnd <dbl>, netincome <dbl>, assets <dbl>, liabilities <dbl>,
#   marketcap <dbl>, industry <chr>
co_name <- combo_df %>% 
  distinct(name) %>%
  pull()
co_name
[1] "Merck & Co Inc"
co_location <- combo_df %>% 
  distinct(location) %>%
  pull()
co_location
[1] "New Jersey; U.S.A"
co_industry <- combo_df %>% 
  distinct(industry) %>%
  pull()
co_industry
[1] "Drug Manufacturers - General"