Introduction

NEON Birds

The U.S National Science Foundation’s National Ecological Observatory Network is a continental-scale observation facility designed to collect long-term ecological data. The data product being used contains the quality-controlled, native sampling resolution data from NEON’s breeding landbird sampling.

Protocols: https://data.neonscience.org/documents/10179/1883155/NEON.DOC.014041vL/2a13bc2b-84db-e4e5-1d33-af8fac1f9e24 Data Product: https://data.neonscience.org/data-products/DP1.10003.001

Occupancy Modeling

Occupancy models are used to help estimate true occupancy of a species and can help account for imperfect detection of organisms in a study.

Occupancy (psi) = the probability that a species occurs at a site. Detection (p) = the probability that a species is detected at a site, given occupancy

Setup

Importing libraries

library(neonUtilities) #package for downloading NEON data

## Warning: package 'neonUtilities' was built under R version 4.3.3

library(RPresence) # library for doing occupancy modelling

## Warning: package 'RPresence' was built under R version 4.4.1

## RPresence_2.15.9
##  - new: static staggered-entry model (type="so.se") added
##  - new: integrated habitat model (type="do.ih") added

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.3.3

library(stringr)
library(ggplot2) # library for visualization

## Warning: package 'ggplot2' was built under R version 4.3.3

Importing NEON Data

large_sites <- c(
  "BARR", "BART", "BONA", "CLBJ", "CPER", "DEJU", "DSNY", "GRSM",
  "GUAN", "HARV", "HEAL", "JERC", "JORN", "KONZ", "MLBS", "MOAB",
  "NIWO", "OAES", "ONAQ", "ORNL", "OSBS", "RMNP", "SJER", "SRER",
  "STEI", "TALL", "TEAK", "UKFS", "UNDE", "WOOD", "WREF", "YELL"
)
small_sites <- c(
  "DELA", "LENO", "TOOL", "SOAP", "STER", "PUUM", "KONA",
  "SERC", "DCFS", "NOGP", "LAJA", "BLAN", "SCBI", "ABBY", "TREE"
)
sites <- c(large_sites, small_sites)

if (!dir.exists("data")) {
  dir.create("data")
}

if (!dir.exists("outputs")) {
  dir.create("outputs")
}

if (file.exists("data/bird_counts.RData")) {
  load("data/bird_counts.RData")
} else {
  
  
  ###todo: add NEON token
  bird.counts <- loadByProduct(dpID="DP1.10003.001",
                               site=sites,
                               startdate="2018-01",
                               enddate="2023-12",
                               check.size = F)
  
  save(bird.counts, file = "data/bird_counts.RData")
}

Data Examination

Describe how the data are set up (i.e., there are two tables, with what data in them?)

CountData:

# Time distribution (number of observations per year)
bird.counts$brd_perpoint %>%
  mutate(year = lubridate::year(startDate)) %>%
  ggplot(aes(x = factor(year))) +
  geom_bar(fill = "forestgreen") +
  ylim(0, 4000) + 
  labs(
    title = "Distribution of Observations Over Time",
    x = "Year",
    y = "Number of Observations"
  ) +
  theme_minimal()

# % missing for each column
missing_pct <- sapply(bird.counts$brd_perpoint, function(x) {
  round(sum(is.na(x)) / length(x) * 100, 2)
})

missing_df <- data.frame(
  column = names(missing_pct),
  missing_percent = missing_pct
) %>%
  filter(missing_percent > 0)  # show only columns with missing values

ggplot(missing_df, aes(x = reorder(column, -missing_percent), y = missing_percent)) +
  geom_bar(stat = "identity", fill = "forestgreen") +
  labs(
    title = "Percentage of Missing Data by Column",
    x = "Column",
    y = "Missing Data (%)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Distribution by site
ggplot(bird.counts$brd_perpoint, aes(x = siteID)) +
  geom_bar(fill = "forestgreen") +
  labs(title = "Distribution of Observations by Site", x = "Site ID", y = "Count") +
  theme(axis.text.x = element_text(angle = 90))

# Species by site
grouped_data <- bird.counts$brd_countdata %>%
    group_by(siteID)
  
summarized_data <- grouped_data %>%
    summarize(num_species = n_distinct(scientificName)) 
    
ggplot(summarized_data, aes(x = siteID, y = num_species)) +
  geom_bar(stat = "identity", fill = "forestgreen") +
  labs(title = "Number of Species Observed by Site", x = "Site ID", y = "Number of Species") +
  theme(axis.text.x = element_text(angle = 90))

PerPoint

# Time distribution (number of observations per year)
bird.counts$brd_countdata %>%
  mutate(year = lubridate::year(startDate)) %>%
  ggplot(aes(x = factor(year))) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Distribution of Observations Over Time",
    x = "Year",
    y = "Number of Observations"
  ) +
  theme_minimal()

# % missing for each column
missing_pct <- sapply(bird.counts$brd_countdata, function(x) {
  round(sum(is.na(x)) / length(x) * 100, 2)
})

missing_df <- data.frame(
  column = names(missing_pct),
  missing_percent = missing_pct
) %>%
  filter(missing_percent > 0)

ggplot(missing_df, aes(x = reorder(column, -missing_percent), y = missing_percent)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Percentage of Missing Data by Column",
    x = "Column",
    y = "Missing Data (%)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Observations by site
ggplot(bird.counts$brd_countdata, aes(x = siteID)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Distribution of Observations by Site", x = "Site ID", y = "Count") +
  theme(axis.text.x = element_text(angle = 90))

# Species by site
grouped_data <- bird.counts$brd_countdata %>%
    group_by(siteID)
  
summarized_data <- grouped_data %>%
    summarize(num_species = n_distinct(scientificName)) 
    
ggplot(summarized_data, aes(x = siteID, y = num_species)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Number of Species Observed by Site", x = "Site ID", y = "Number of Species") +
  theme(axis.text.x = element_text(angle = 90))

Data Preparation

First, we need to do some generic data cleanup.

Create unique surveyID - we have to create a unique key first consisting of the site, plot, point, year and bout)

brd_perpoint_clean <- bird.counts$brd_perpoint %>%
  mutate(year = str_extract(eventID, "\\d{4}"))%>%
  mutate(pointSurveyID = paste(plotID, "point", pointID, year, "bout", boutNumber, sep = "_")) %>%
  mutate(locationID = paste(plotID, "point", pointID, sep = "_")) %>%
  mutate(timeID = paste(year, "bout", boutNumber, sep = "_"))

brd_countdata_clean <- bird.counts$brd_countdata %>%
  mutate(year = str_extract(eventID, "\\d{4}"))%>%
  mutate(pointSurveyID = paste(plotID, "point", pointID, year, "bout", boutNumber, sep = "_")) %>%
  mutate(locationID = paste(plotID, "point", pointID, sep = "_")) %>%
  mutate(timeID = paste(year, "bout", boutNumber, sep = "_"))

Various data issues.

A large site should have points 1-9 for each plot. If it doesn’t, something is wrong. Remove the whole plot survey if missing a point survey

Note: this isn’t a huge deal for these analyses but this is a problem. Needs to be above any survey removal if using.

Filter out surveys where they could not finish

brd_perpoint_clean <- brd_perpoint_clean %>%
  filter(samplingImpractical == "OK")

Since we’re really only interested in species level data, we need to remove records that are not recorded to the species level or had no records for that pointSurvey minute (taxonRank will be NA)

brd_countdata_clean <- brd_countdata_clean %>%
  filter(taxonRank == "species")

Sometimes there are duplicate pointSurveyIDs and there shouldn’t be - likely errors in data input.

duplicate_pointSurveyID <- brd_perpoint_clean %>%
  count(pointSurveyID) %>%
  filter(n > 1) %>%
  pull(pointSurveyID)

duplicate_pointSurveyID

##  [1] "HARV_004_point_7_2019_bout_1" "HEAL_006_point_1_2019_bout_1"
##  [3] "HEAL_006_point_2_2019_bout_1" "HEAL_006_point_3_2019_bout_1"
##  [5] "HEAL_006_point_4_2019_bout_1" "HEAL_006_point_5_2019_bout_1"
##  [7] "HEAL_006_point_6_2019_bout_1" "HEAL_006_point_7_2019_bout_1"
##  [9] "HEAL_006_point_8_2019_bout_1" "HEAL_006_point_9_2019_bout_1"
## [11] "JERC_011_point_6_2019_bout_1" "SJER_033_point_1_2022_bout_1"
## [13] "SJER_033_point_2_2022_bout_1" "SJER_033_point_3_2022_bout_1"
## [15] "SJER_033_point_4_2022_bout_1" "SJER_033_point_5_2022_bout_1"
## [17] "SJER_033_point_6_2022_bout_1" "SJER_033_point_7_2022_bout_1"
## [19] "SJER_033_point_8_2022_bout_1" "SJER_033_point_9_2022_bout_1"
## [21] "TEAK_031_point_1_2019_bout_1" "TEAK_031_point_2_2019_bout_1"
## [23] "TEAK_031_point_3_2019_bout_1" "TEAK_031_point_4_2019_bout_1"
## [25] "TEAK_031_point_5_2019_bout_1" "TEAK_031_point_6_2019_bout_1"
## [27] "TEAK_031_point_7_2019_bout_1" "TEAK_031_point_8_2019_bout_1"
## [29] "TEAK_031_point_9_2019_bout_1"

#remove both of the duplicate surveys from each table
brd_perpoint_clean <- brd_perpoint_clean %>%
  filter(!pointSurveyID %in% duplicate_pointSurveyID)

brd_countdata_clean <- brd_countdata_clean %>%
  filter(!pointSurveyID %in% duplicate_pointSurveyID)

Do the join

brd_joineddata_clean <- inner_join(
  brd_countdata_clean,
  brd_perpoint_clean,
  by = "pointSurveyID",
  suffix = c(".count", ".perpoint")
)

#Remove duplicate columns between tables
brd_joineddata_clean <- brd_joineddata_clean %>%
  select(-matches("\\.perpoint$"))
names(brd_joineddata_clean) <- gsub("\\.count$", "", names(brd_joineddata_clean))

Data Standardization

Because there is inconsistent sampling across sites (i.e., some sites are larger than others and not all plots within a site were continuously sampled over time), in order to do analyses across sites, we need to make sure that the data are standardized with respect to amount of sampling time.

Actually - a way to get around this might be to just add a detection covariate for time or number of surveys to account for this.

Descriptive Analyses

Great, now that our data are cleaned and standardized, we can start doing some basic analyses. First, lets take a look at a single species within a site in terms of its detection and occupancy.

Single-season Occupancy Model

Single Species

Melanerpes carolinus (Red-bellied Woodpecker) across all sites

Data Munging (Create the occupancy table and covariants, if any)

# Filter table by species
bird_species <- "Melanerpes carolinus"
brd_joineddata_model1 <- brd_joineddata_clean %>%
  filter(siteID %in% sites) %>%
  filter(scientificName == bird_species)

# Get all valid surveys years across sites (so we can fill in NAs)
valid_site_years <- brd_perpoint_clean %>%
  distinct(siteID, year)

# Get detections — only species *actually observed at a site* (so we can fill in 0s)
detections <- brd_joineddata_model1 %>%
  group_by(siteID, year) %>%
  summarize(present = 1, .groups = "drop")

# Join and fill
site_by_year <- valid_site_years %>%
  left_join(detections, by = c("siteID", "year")) %>%
  mutate(present = replace_na(present, 0)) %>%
  pivot_wider(
    names_from = year,
    values_from = present
  )

head(site_by_year, n=10)

Create the Pao presence-absence object (Pao)

birds_pao_model1 <- createPao(
  data = site_by_year[, c("2018", "2019", "2020", "2021", "2022", "2023")],
  unitnames = site_by_year$siteID,
  title = "Red-bellied Woodpecker"
)

Run the actual model

birds_occupancy_model1 <- occMod(
  data = birds_pao_model1,           
  model = list(psi ~ 1,  p ~ 1),
  type = 'so'
  )

summary(birds_occupancy_model1)

## Model name=psi(1)p(1)
## AIC=142.8966
## -2*log-likelihood=138.8966
## num. par=2
## Warning: Numerical convergence may not have been reached. Parameter estimates converged to approximately 7 signifcant digits.

Model Results

This shows that Melanerpes carolinus has an occupancy probability (psi) of 45% (i.e., if you go to any of these sites, there is a 45% chance of this bird occupying the site) and a detection probability (p) of 91% (i.e., if you visit a site and the bird occupies it, there is a 91% chance you will detect it)

#print_one_site_estimates(mod = birds_occupancy_model1, site = 1)

#fitted(object = birds_occupancy_model1, param = 'psi', prob = 0.95)
#fitted(object = birds_occupancy_model1, param = 'p', prob = 0.95)

ests <- as.data.frame(print_one_site_estimates(mod = birds_occupancy_model1))

ests <- ests[1:2,] %>% 
  mutate(parm = c("psi", "p")) %>% 
  select(parm, est, se, lower, upper) %>% 
  rename(., 
    c(
      Parameter = parm,
      Estimate = est,
      SE = se,
      L95 = lower,
      U95 = upper
    )
  ) %>% 
  `rownames<-`(seq_len(nrow(ests[1:2,])))

ests

ggplot(ests, aes(x = Parameter, y = Estimate)) +
  labs(x = "Parameter", y = "Estimate") +
  geom_bar(stat = 'identity') +
  geom_errorbar(aes(ymin = L95, ymax = U95), width = 0.2) +
  theme_bw()

Across all Species

Create a function

todo: change the vars to be just in scope for the function

run_occ_for_species_list <- function(species_df, brd_joineddata_clean, brd_perpoint_clean, sites, years = c("2018", "2019", "2020", "2021", "2022", "2023")) {
  
  results <- list()
  detection_histories <- list()
  
  for (i in seq_len(nrow(species_df))) {
    
    sci_name <- species_df$scientificName[i]
    common_name <- species_df$vernacularName[i]
    
    # Filter detection data to this species and the sites
    brd_species <- brd_joineddata_clean %>%
      filter(siteID %in% sites, scientificName == sci_name)
    
    # Get valid surveys (site-year combos with effort)
    valid_site_years <- brd_perpoint_clean %>%
      filter(siteID %in% sites) %>%
      distinct(siteID, year)
    
    # Get detections for this species
    detections <- brd_species %>%
      group_by(siteID, year) %>%
      summarize(present = 1, .groups = "drop")
    
    # Build site-year detection matrix
    site_by_year <- valid_site_years %>%
      left_join(detections, by = c("siteID", "year")) %>%
      mutate(present = replace_na(present, 0)) %>%
      pivot_wider(
        names_from = year,
        values_from = present
      )
    
    # Create Pao object
    birds_pao <- createPao(
      data = site_by_year[, years],
      unitnames = site_by_year$siteID,
      title = common_name
    )
    
    # Run occupancy model
    model <- occMod(
      data = birds_pao,
      model = list(psi ~ 1, p ~ 1),
      type = "so"
    )
    
    # Extract estimates
    ests <- print_one_site_estimates(mod = model) %>%
      as.data.frame() %>%
      slice(1:2)
    
    rownames(ests) <- NULL
    
    named_est <- list(
      vernacularName = common_name,
      psi_Estimate = ests$est[1],
      psi_SE       = ests$se[1],
      psi_L95      = ests$lower[1],
      psi_U95      = ests$upper[1],
      p_Estimate   = ests$est[2],
      p_SE         = ests$se[2],
      p_L95        = ests$lower[2],
      p_U95        = ests$upper[2]
    )

    detection_histories[[sci_name]] <- site_by_year
    results[[sci_name]] <- named_est
  }
  
  return(list(
    estimates = results,
    detection_histories = detection_histories
  ))
  
}

Run the function across all species

species_input <- brd_joineddata_clean %>%
  distinct(scientificName, vernacularName) %>%
  arrange(scientificName)

results_df <- run_occ_for_species_list(
  species_df = species_input,
  brd_joineddata_clean = brd_joineddata_clean,
  brd_perpoint_clean = brd_perpoint_clean,
  sites = sites
)

Results

Plot psi (occupancy)

psi_df <- bind_rows(lapply(names(results_df$estimates), function(name) {
  row <- results_df$estimates[[name]]
  tibble(
    scientificName = name,
    vernacularName = row$vernacularName,
    psi_Estimate = row$psi_Estimate,
    psi_SE = row$psi_SE,
    psi_L95 = row$psi_L95,
    psi_U95 = row$psi_U95
  )
})) %>%
  arrange(psi_Estimate)

psi_df <- psi_df %>%
  mutate(label = paste0(scientificName))
  #mutate(label = paste0(scientificName, "\n", vernacularName))

ggplot(psi_df, aes(x = reorder(label, psi_Estimate), y = psi_Estimate)) +
  geom_col(fill = "grey") +
  geom_errorbar(aes(ymin = psi_L95, ymax = psi_U95), width = 0.2) +
  labs(
    title = "Species Occupancy (ψ) Estimates",
    x = "Species",
    y = "ψ Estimate"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Plot p (detection)

p_df <- bind_rows(lapply(names(results_df$estimates), function(name) {
  row <- results_df$estimates[[name]]
  tibble(
    scientificName = name,
    vernacularName = row$vernacularName,
    p_Estimate = row$p_Estimate,
    p_SE = row$p_SE,
    p_L95 = row$p_L95,
    p_U95 = row$p_U95
  )
})) %>%
  arrange(p_Estimate)

p_df <- p_df %>%
  mutate(label = paste0(scientificName))
  #mutate(label = paste0(scientificName, "\n", vernacularName))

ggplot(p_df, aes(x = reorder(label, p_Estimate), y = p_Estimate)) +
  geom_col(fill = "grey") +
  geom_errorbar(aes(ymin = p_L95, ymax = p_U95), width = 0.2) +
  labs(
    title = "Species Detection Probability (p) Estimates",
    x = "Species",
    y = "p Estimate"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Take a look at the data for any of the species - we probably aren’t interested in species with a 1 for detection, 1 for occupancy (there likely aren’t any species that have all 1s)

Curious about species with higher occupancy but lower detection

For a given occupancy range, get species and related detections. It will be interesting to see the range.

estimates_df <- bind_rows(lapply(names(results_df$estimates), function(name) {
  row <- results_df$estimates[[name]]
  tibble(
    scientificName = name,
    vernacularName = row$vernacularName,
    psi_Estimate = row$psi_Estimate,
    psi_L95 = row$psi_L95,
    psi_U95 = row$psi_U95,
    p_Estimate = row$p_Estimate,
    p_L95 = row$p_L95,
    p_U95 = row$p_U95
  )
}))

psi_range_species <- estimates_df %>%
  filter(psi_Estimate >= 0.5, psi_Estimate <= 0.6) %>%
  select(scientificName, vernacularName, psi_Estimate, p_Estimate, p_L95, p_U95) %>%
  arrange(desc(p_Estimate))

psi_range_species

Blue Jays and Great Horned Owls have similar occupancies (.51 and .52) but wildly different detections (.91 and .23).

Adding Covariates

Detectability and occupancy of a species can be influenced by many different random effects, such as:

Survey characteristics:

The observer: some observers may be better at spotting individuals of a species that are particularly camouflaged to their environments
The weather: adverse weather conditions such as rain or snow can impact visibility during a study, and therefore impact detection probability

Site characteristics: sites with heavy cover or dense forests may decrease detection probability and make it harder for observers to identify the species

Example: Sitka black-tailed deer that live in the boreal forests in southeast Alaska may be harder to spot than mule deer living in more open ecosystems in Nevada

We can include data about these characteristics in our model to either a) describe previously known variation and thereby improve the model fit or b) test to see whether occupancy/detection of a site is a function of a covariate.

Occupancy Covariates

These are covariates that influence occupancy of a species. Occupancy probability can only be influenced by site characteristics.

For example, shorebird occupancy would benefit from data on the extent of tidal flats, seabirds distributions are often influenced by ocean depth, and in many regions elevation plays a critical role in determining site occupancy.

Melanerpes carolinus (Red-bellied Woodpecker) across all sites

Lets add elevation as a site covariate and see how it improves the model fit.

Data Munging (Create the covariants)

We use the same occupancy table as above, but we need to create a table with the site covariates (aka nlcdClass).

#categorical
#site_cov <- brd_perpoint_clean %>%
#  select(siteID, nlcdClass) %>%
#  distinct(siteID, .keep_all = TRUE) %>%
#  mutate(nlcdClass = as.factor(nlcdClass)) %>%
#  tibble::column_to_rownames("siteID")

#non-categorical
site_cov <- brd_perpoint_clean %>%
  select(siteID, elevation) %>%
  distinct(siteID, .keep_all = TRUE) %>%
  tibble::column_to_rownames("siteID")

todo: Plot the elevation for each site.

Create the Pao presence-absence object (Pao)

birds_pao_model2 <- createPao(
  data = site_by_year[, c("2018", "2019", "2020", "2021", "2022", "2023")],
  unitnames = site_by_year$siteID,
  unitcov = site_cov,
  title = "Red-bellied Woodpecker"
)

Run the actual model

birds_occupancy_model2 <- occMod(
  data = birds_pao_model2,           
  model = list(psi ~ elevation,  p ~ 1),
  type = 'so'
  )

Model Results

mod.list <-  list(
    birds_occupancy_model1, 
    birds_occupancy_model2
)
aictable <- createAicTable(mod.list = mod.list)
aictable$table

Looking at the two models, we can briefly look at the AIC values. The AIC for the model with elevation is lower (better), so the principle of parsimony suggests this is the better model: it is the more parsimonous model when considering both fit and number of parameters.

We see that compared to the original model that did not account for elevation there are 3 parameters vs 2. We can calculate a p value to quantify how much better of a fit this model is.

#get the difference in the neg2loglike between the two models
diff <- abs(birds_occupancy_model1$neg2loglike - birds_occupancy_model2$neg2loglike)

#look up our observed difference on a chi-square distribution with 1 degree of freedom to get p value
pchisq(diff, df = 1, lower.tail = FALSE)

## [1] 2.356024e-05

A p value of 2.356024e-05 is low so we can confidently reject the null hypothesis.

coef(birds_occupancy_model2, param = 'psi', prob = 0.95)

data <- cbind(site_cov, birds_occupancy_model2$real$psi) %>% 
  distinct(.keep_all = TRUE) %>%
  arrange(elevation)

ggplot(data, aes(x = elevation, y = est)) +
  geom_point() + 
  labs(x = "Elevation",
       y = "Predicted Occupancy") +
  lims(y = c(0, 1))

Detection Covariates

Can also be unique for each site as well as the survey

It is important to recognize that detection probability can be influenced by either survey characteristics or site characteristics.

We are going to see whether cloud cover of the survey affects detection of Red-bellied Woodpecker.

Data Munging (Create the covariants)

# Filter table by species
bird_species <- "Melanerpes carolinus"

brd_joineddata_model3 <- brd_joineddata_clean %>%
  filter(siteID %in% large_sites) %>%
  filter(scientificName == bird_species)

# Get all valid surveys years across sites (so we can fill in NAs)
valid_plot_years <- brd_perpoint_clean %>%
  filter(siteID %in% large_sites) %>%
  distinct(plotID, year)

# Get detections — only species *actually observed at a site* (so we can fill in 0s)
detections <- brd_joineddata_model3 %>%
  filter(siteID %in% large_sites) %>%
  group_by(plotID, year
) %>%
  summarize(present = 1, .groups = "drop")

# Join and fill
plot_by_year <- valid_plot_years %>%
  left_join(detections, by = c("plotID","year")) %>%
  mutate(present = replace_na(present, 0)) %>%
  pivot_wider(
    names_from = year,
    values_from = present
  ) %>%
  select(1, sort(names(.)[-1]))

# # Create the detection covariate table: average cloud cover per plot per year
# cloud_cover_by_plot_year <- brd_perpoint_clean %>%
#   filter(siteID %in% large_sites) %>%
#   group_by(plotID, year) %>%
#   summarize(
#     cloudCover = mean((startCloudCoverPercentage + endCloudCoverPercentage) / 2),
#     .groups = "drop"
#   ) %>%
#   pivot_wider(
#     names_from = year,
#     values_from = cloudCover
#   ) %>%
#   tibble::column_to_rownames('plotID') %>%
#   select(sort(names(.)))

# View result
#head(cloud_cover_by_plot_year)

get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

detection_method_all_detections <- brd_joineddata_model3 %>%
  group_by(plotID, year) %>%
  summarise(detectionMethod = get_mode(detectionMethod), .groups = "drop")

det_method_matrix <- valid_plot_years %>%
  left_join(detection_method_all_detections, by = c("plotID","year")) %>%
  mutate(detectionMethod = replace_na(detectionMethod, 'not_detected')) %>%
  pivot_wider(
    names_from = year,
    values_from = detectionMethod
  ) %>%
  select(1, sort(names(.)[-1]))

det_method_matrix <- as.data.frame(det_method_matrix)
rownames(det_method_matrix) <- det_method_matrix$plotID
det_method_matrix <- det_method_matrix[ , -1]

#det_method_numeric <- det_method_matrix %>%
#  mutate(across(everything(), ~ as.numeric(as.factor(.))))

We use the same occupancy table as above, but we need to create a table with the survey covariates.

surveycov <- data.frame(
  detMethod = unlist(det_method_matrix)
  )
row.names(surveycov) <- NULL

Create the Pao presence-absence object (Pao)

birds_pao_model3 <- createPao(
  data = plot_by_year[, c("2018", "2019", "2020", "2021", "2022", "2023")],
  unitnames = plot_by_year$plotID,
  survcov = surveycov,
  title = "Red-Bellied Woodpecker"
)

Run the actual model

birds_occupancy_model3 <- occMod(
  data = birds_pao_model3,           
  model = list(psi ~ 1,  p ~ detMethod),
  type = 'so'
  )

Model Results

mod.list <-  list(
    birds_occupancy_model1, 
    birds_occupancy_model3
)
aictable <- createAicTable(mod.list = mod.list)
aictable$table

results <- print_one_site_estimates(
  mod = birds_occupancy_model3,
  site = 1)

ggplot(data = data.frame(results), aes(x = rownames(results), y = est)) +
  geom_bar(stat = 'identity') + 
  geom_errorbar(aes(ymin = lower, ymax = upper, width = 0.2)) +
  lims(y = c(0, 1)) +
  labs(y = "Maximum likelihood estimate",
       x = "Parameter")

data <- data.frame(
  detMethod = birds_pao_model3$survcov$detMethod
  ) %>%
  filter(!is.na(detMethod), detMethod != "not_detected")

ggplot(data, aes(x = detMethod)) + 
  geom_bar(stat = 'count') +
  labs(x = "Detection Method", y = "Number of Surveys Conducted") +
  theme(axis.text.x = element_text(angle = 45, margin = margin(t = 30)))

Multi-season Occupancy Model

Predictive/Causal Hypotheses

NEON Bird Occupancy Modeling

Anushka Gupta

2025-10-08

Introduction

NEON Birds

Occupancy Modeling

Setup

Importing libraries

Importing NEON Data

Data Examination

Data Preparation

Data Standardization

Descriptive Analyses

Single-season Occupancy Model

Single Species

Data Munging (Create the occupancy table and covariants, if any)

Create the Pao presence-absence object (Pao)

Run the actual model

Model Results

Across all Species

Create a function

Results

Adding Covariates

Occupancy Covariates

Data Munging (Create the covariants)

Create the Pao presence-absence object (Pao)

Run the actual model

Model Results

Detection Covariates

Data Munging (Create the covariants)

Create the Pao presence-absence object (Pao)

Run the actual model

Model Results

Multi-season Occupancy Model

Predictive/Causal Hypotheses