Lesson 5 Importing Data

In the previous lesson, we extracted a subset of the EBD containing Yellow-rumped Warbler observations from Guatemala. The output file created by auk_filter() is a tab-separated text file and could be read into R using read.delim() or readr::read_tsv(); however, auk has a function specifically for reading the EBD. read_ebd() does the following:

Reads the data using data.table::fread(), which is much faster than read.delim().
Sets the correct data types for the columns.
Cleans up the column names so they are all snake_case.
Automatically performs some post-processing steps, which will be covered later in this lesson.

Let’s read in the data!

library(auk)
library(tidyverse)

ebd <- read_ebd("data/ebd_yerwar.txt", unique = FALSE, rollup = FALSE)
glimpse(ebd)
#> Observations: 160
#> Variables: 46
#> $ global_unique_identifier     <chr> "URN:CornellLabOfOrnithology:EBIRD:OBS239833810", "URN:CornellLabOfOrnithology:E…
#> $ last_edited_date             <chr> "2018-09-09 12:59:27", "2017-08-29 11:00:49", "2014-04-03 23:33:43", "2015-06-23…
#> $ taxonomic_order              <dbl> 32863, 32859, 32858, 32858, 32858, 32863, 32859, 32860, 32858, 32858, 32858, 328…
#> $ category                     <chr> "issf", "issf", "species", "species", "species", "issf", "issf", "issf", "specie…
#> $ common_name                  <chr> "Yellow-rumped Warbler", "Yellow-rumped Warbler", "Yellow-rumped Warbler", "Yell…
#> $ scientific_name              <chr> "Setophaga coronata", "Setophaga coronata", "Setophaga coronata", "Setophaga cor…
#> $ subspecies_common_name       <chr> "Yellow-rumped Warbler (Goldman's)", "Yellow-rumped Warbler (Myrtle)", NA, NA, N…
#> $ subspecies_scientific_name   <chr> "Setophaga coronata goldmani", "Setophaga coronata coronata", NA, NA, NA, "Setop…
#> $ observation_count            <chr> "12", "8", "11", "3", "1", "15", "4", "X", "3", "X", "X", "2", "1", "20", "1", "…
#> $ breeding_bird_atlas_code     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ breeding_bird_atlas_category <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ age_sex                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Female, Adu…
#> $ country                      <chr> "Guatemala", "Guatemala", "Guatemala", "Guatemala", "Guatemala", "Guatemala", "G…
#> $ country_code                 <chr> "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "GT", "G…
#> $ state                        <chr> "Huehuetenango", "Petén", "Huehuetenango", "Jalapa", "Quetzaltenango", "Huehuete…
#> $ state_code                   <chr> "GT-HU", "GT-PE", "GT-HU", "GT-JA", "GT-QZ", "GT-HU", "GT-PE", "GT-PE", "GT-PE",…
#> $ county                       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ county_code                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ iba_code                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ bcr_code                     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ usfws_code                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ atlas_block                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ locality                     <chr> "Cerro de los Cuervos", "Tikal Area", "Capellania to La Ventosa", "Cerro Alto, J…
#> $ locality_id                  <chr> "L2713729", "L4754006", "L2713730", "L3222877", "L1380407", "L2713729", "L475400…
#> $ locality_type                <chr> "P", "P", "P", "P", "H", "P", "P", "P", "P", "H", "P", "P", "H", "P", "H", "H", …
#> $ latitude                     <dbl> 15.5, 17.2, 15.5, 14.7, 14.7, 15.5, 17.2, 17.2, 17.2, 15.2, 17.2, 17.2, 17.2, 14…
#> $ longitude                    <dbl> -91.5, -89.6, -91.5, -90.0, -91.5, -91.5, -89.6, -89.6, -90.3, -90.2, -89.6, -89…
#> $ observation_date             <date> 2014-03-07, 2014-01-19, 2014-03-05, 2014-01-26, 2014-12-24, 2014-03-05, 2014-01…
#> $ time_observations_started    <chr> "07:10:00", "14:00:00", "10:55:00", "06:30:00", "09:00:00", "06:15:00", "14:00:0…
#> $ observer_id                  <chr> "obsr200421", "obsr837809", "obsr200421", "obsr411126", "obsr553524", "obsr32598…
#> $ sampling_event_identifier    <chr> "S17445204", "S36593691", "S17445203", "S20932409", "S21010046", "S17445397", "S…
#> $ protocol_type                <chr> "Traveling", "Traveling", "Traveling", "Traveling", "Traveling", "Traveling", "T…
#> $ protocol_code                <chr> "P22", "P22", "P22", "P22", "P22", "P22", "P22", "P22", "P22", "P22", "P22", "P2…
#> $ project_code                 <chr> "EBIRD", "EBIRD", "EBIRD", "EBIRD", "EBIRD", "EBIRD", "EBIRD", "EBIRD", "EBIRD",…
#> $ duration_minutes             <int> 170, 210, 55, 300, 120, 210, 180, 240, 60, 300, 240, 180, 60, 60, 60, 210, 120, …
#> $ effort_distance_km           <dbl> 2.01, 1.61, 4.83, 5.00, 2.41, 2.01, 1.61, 3.22, 0.50, 3.22, 3.22, 0.25, 2.00, NA…
#> $ effort_area_ha               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ number_observers             <int> 2, 3, 3, 6, 1, 3, 3, 14, 4, 12, 14, 1, 2, 3, 6, 2, 3, 2, 2, 2, 2, 3, 3, 3, 4, 2,…
#> $ all_species_reported         <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
#> $ group_identifier             <chr> "G828557", "G2390002", "G828560", NA, "G1125157", "G828561", "G2390003", NA, NA,…
#> $ has_media                    <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALS…
#> $ approved                     <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
#> $ reviewed                     <lgl> TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
#> $ reason                       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ trip_comments                <chr> NA, NA, "Driving w/ stops", NA, "Walked up the trail from the hot springs to rid…
#> $ species_comments             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "subespecie Audubon. Todos l…

We’ll cover the use of unique = FALSE and rollup = FALSE next. For now, let’s just look at the data.

Exercise

Take a minute to explore these data using glimpse() and View(). Familiarize yourself with the columns. Be sure you can find the effort columns and the observation_count column.

Checkpoint

Do you have the data in a data frame? Does anyone have any questions about the data so far?

5.1 Group checklists

eBird allows users to share checklists with other eBird users that they’re birding with. This results in multiple copies of some checklists in the database. Group checklists can be identified in the data because they have the group_identifier column populated. Let’s take a look at some of these checklists.

ebd %>% 
  filter(!is.na(group_identifier)) %>% 
  arrange(group_identifier) %>% 
  select(sampling_event_identifier, group_identifier) %>% 
  head()
#> # A tibble: 6 x 2
#>   sampling_event_identifier group_identifier
#>   <chr>                     <chr>           
#> 1 S20741847                 G1059449        
#> 2 S20713245                 G1059449        
#> 3 S20713307                 G1059450        
#> 4 S20741848                 G1059450        
#> 5 S20925877                 G1072982        
#> 6 S20929011                 G1072982

We see that there are multiple checklists with the same group_identifier, implying that these checklists have been shared and are duplicates. Let’s look at one of these on the eBird website: https://ebird.org/view/checklist/S20741847

As it turns out, group checklists aren’t exact duplicates; once a checklist has been shared the individual checklists can diverge in terms of the species seen, the counts for each species, and even the protocol and effort. For an example, look at this checklist with six observers each of whom saw a different set of species.

In most cases, you’ll only want to retain one of these checklists, but it’s not trivial to do so because the checklists are only partial duplicates. The function auk_unique() manages this for you. Specifically, for each species, it retains only the first observation of that species, which is typically the one submitted by the primary observer (i.e. the person who submitted the checklist to eBird). Note that the resulting “checklist” will be a combination of all the species seen across all copies of the group checklist.

keep_one <- auk_unique(ebd)
nrow(ebd)
#> [1] 160
nrow(keep_one)
#> [1] 104

When auk_unique() is run, a new field is created (checklist_id), which is populated with group_identifier for group checklists and sampling_event_identifier otherwise; this is now a unique identifier for checklists. In addition, the full set of observer and sampling event identifiers has been retained in a comma-separated format.

keep_one %>% 
  filter(!is.na(group_identifier)) %>% 
  select(checklist_id, sampling_event_identifier, 
         group_identifier, observer_id) %>% 
  head()
#> # A tibble: 6 x 4
#>   checklist_id sampling_event_identifier               group_identifier observer_id                                
#>   <chr>        <chr>                                   <chr>            <chr>                                      
#> 1 G828558      S17445097,S17445389                     G828558          obsr200421,obsr325989                      
#> 2 G828561      S17445202,S17445397                     G828561          obsr200421,obsr325989                      
#> 3 G828560      S17445203,S17445396                     G828560          obsr200421,obsr325989                      
#> 4 G828557      S17445204,S17445204,S17445386,S17445386 G828557          obsr200421,obsr200421,obsr325989,obsr325989
#> 5 G828555      S17445206,S17445382                     G828555          obsr200421,obsr325989                      
#> 6 G828571      S17445344,S17445420                     G828571          obsr200421,obsr325989

By default, whenever you import data with read_ebd() it calls auk_unique() automatically; however, this behavior can be controlled with the unique argument. So, for example, the following will import data and remove duplicates.

ebd <- read_ebd("data/ebd_yerwar.txt", rollup = FALSE)

Tip

auk_unique() takes a long time to run on large datasets. Consider using read_ebd(unique = FALSE) when importing large text files to speed up the process.

5.2 Taxonomy

eBird users can enter data for a wide range of taxa in addition to species. Observations can be reported at a level more granular than species (e.g. subspecies or recognizable forms) or at a higher level than species (e.g. spuhs, slashes, and hybrids). All the different taxa that can be reported are contained in the eBird taxonomy, which is updated every year in August. The eBird Science page has a subsection with details on the eBird taxonomy, and the taxonomy itself is available as a data frame in the auk package.

glimpse(ebird_taxonomy)
#> Observations: 16,513
#> Variables: 8
#> $ scientific_name <chr> "Struthio camelus", "Struthio molybdophanes", "Struthio camelus/molybdophanes", "Rhea america…
#> $ common_name     <chr> "Common Ostrich", "Somali Ostrich", "Common/Somali Ostrich", "Greater Rhea", "Lesser Rhea", "…
#> $ species_code    <chr> "ostric2", "ostric3", "y00934", "grerhe1", "lesrhe2", "lesrhe4", "lesrhe3", "tabtin1", "higti…
#> $ category        <chr> "species", "species", "slash", "species", "species", "issf", "issf", "species", "species", "i…
#> $ taxon_order     <dbl> 1, 6, 7, 8, 14, 15, 18, 19, 20, 21, 26, 27, 30, 35, 36, 39, 52, 53, 54, 55, 56, 71, 72, 73, 7…
#> $ order           <chr> "Struthioniformes", "Struthioniformes", "Struthioniformes", "Rheiformes", "Rheiformes", "Rhei…
#> $ family          <chr> "Struthionidae", "Struthionidae", "Struthionidae", "Rheidae", "Rheidae", "Rheidae", "Rheidae"…
#> $ report_as       <chr> NA, NA, NA, NA, NA, "lesrhe2", "lesrhe2", NA, NA, "higtin1", "higtin1", NA, NA, NA, NA, NA, N…
# you can even report that you saw a generic bird!
filter(ebird_taxonomy, common_name == "bird sp.")
#>   scientific_name common_name species_code category taxon_order order family report_as
#> 1        Aves sp.    bird sp.        bird1     spuh       34501  <NA>   <NA>      <NA>

For taxa below the species level, the report_as field specifies the species that this taxon falls under. For example, Myrtle Warbler rolls up to Yellow-rumped Warbler.

# myrtle warbler
filter(ebird_taxonomy, common_name == "Yellow-rumped Warbler (Myrtle)") %>% 
  select(common_name, category, report_as)
#>                      common_name category report_as
#> 1 Yellow-rumped Warbler (Myrtle)     issf    yerwar
# rolls up to yellow-rumped warbler
filter(ebird_taxonomy, species_code == "yerwar") %>% 
  select(common_name, category, report_as)
#>             common_name category report_as
#> 1 Yellow-rumped Warbler  species      <NA>

Exercise

How many different subspecies of Barn Swallow does eBird recognize?

Start by finding the species code for Barn Swallow, then find records in ebird_taxonomy with this code in the report_as column.

ebird_taxonomy %>% 
  filter(common_name == "Barn Swallow") %>% 
  select(common_name, species_code)
#>    common_name species_code
#> 1 Barn Swallow       barswa
ebird_taxonomy %>% 
  filter(report_as == "barswa") %>% 
  select(common_name, category, report_as)
#>                    common_name category report_as
#> 1 Barn Swallow (White-bellied)     issf    barswa
#> 2      Barn Swallow (Egyptian)     issf    barswa
#> 3        Barn Swallow (Levant)     issf    barswa
#> 4      Barn Swallow (Tytler's)     issf    barswa
#> 5  Barn Swallow (Buff-bellied)     issf    barswa
#> 6      Barn Swallow (American)     issf    barswa

eBird recognizes six subspecies.

The EBD contains a subspecies column, which is populated when an observer has identified a bird below species level. In the EBD extract we’re working with, we have three different subspecies of Yellow-rumped Warbler:

count(ebd, common_name, subspecies_common_name)
#> # A tibble: 4 x 3
#>   common_name           subspecies_common_name                n
#>   <chr>                 <chr>                             <int>
#> 1 Yellow-rumped Warbler Yellow-rumped Warbler (Audubon's)    27
#> 2 Yellow-rumped Warbler Yellow-rumped Warbler (Goldman's)     4
#> 3 Yellow-rumped Warbler Yellow-rumped Warbler (Myrtle)       16
#> 4 Yellow-rumped Warbler <NA>                                 57

It’s even possible to have multiple subspecies of the same species on a single checklist.

filter(ebd, checklist_id == "S22725024") %>% 
  select(checklist_id, common_name, subspecies_common_name, observation_count)
#> # A tibble: 2 x 4
#>   checklist_id common_name           subspecies_common_name            observation_count
#>   <chr>        <chr>                 <chr>                             <chr>            
#> 1 S22725024    Yellow-rumped Warbler Yellow-rumped Warbler (Audubon's) 1                
#> 2 S22725024    Yellow-rumped Warbler Yellow-rumped Warbler (Myrtle)    1

For most uses, you’ll want eBird data at the species level, which means dropping higher level taxa and rolling lower level taxa up to species level, making sure to sum the counts if multiple subspecies were present. The function auk_rollup() handles these taxonomic matters for you.

no_subsp <- auk_rollup(ebd)
no_subsp %>% 
  filter(checklist_id == "S22725024") %>% 
  select(checklist_id, common_name, observation_count)
#> # A tibble: 1 x 3
#>   checklist_id common_name           observation_count
#>   <chr>        <chr>                 <chr>            
#> 1 S22725024    Yellow-rumped Warbler 2

By default, when you import data with read_ebd() it calls auk_rollup() automatically; however, this behavior can be controlled with the rollup argument. So, for example, the following will import data and remove duplicates and report all records at species level.

ebd <- read_ebd("data/ebd_yerwar.txt")

Checkpoint

Any questions on data import, taxonomy, or group checklists?