# Functions to calculate the minimum number of sites required to detect trends

# Clean a character or factor vector into snake_case names (lowercase, underscores)
clean_vec <- function(x, refactor = FALSE) {
  # Note: the 'refactor' argument is currently unused
  require(magrittr, quietly = TRUE)
  if (!(is.character(x) || is.factor(x))) {
    return(x)
  } else {
    old_names <- as.character(x)
    new_names <- old_names %>%
      gsub("'", "", .) %>%
      gsub("\"", "", .) %>%
      gsub("%", "percent", .) %>%
      gsub("^[ ]+", "", .) %>%
      make.names(.) %>%
      gsub("[.]+", "_", .) %>%
      gsub("[_]+", "_", .) %>%
      tolower(.) %>%
      gsub("_$", "", .)
    return(new_names)
  }
}
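
# Usage sketch (illustrative only; the input strings below are made up):
# clean_vec(c("Total N (%)", "Site Name"))
# #> [1] "total_n_percent" "site_name"
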
# Standardize response values
standardize <- function(data) {
  # Ensure year is numeric
  data$year <- as.numeric(as.character(data$year))
  # Standardize the response column
  data$stand.response <- (data$response - mean(data$response, na.rm = TRUE)) /
    sd(data$response, na.rm = TRUE)
  # Reorganize the columns based on what exists
  if ("site" %in% names(data)) {
    data <- data[, c("site", "year", "response", "stand.response")]
  } else {
    data <- data[, c("year", "response", "stand.response")]
  }
  return(data)
}
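
# Usage sketch ('dat' is a hypothetical data frame; any data with year and
# response columns works):
# dat <- data.frame(site = rep(c("A", "B"), each = 3),
#                   year = rep(2001:2003, 2),
#                   response = c(1, 2, 3, 2, 3, 4))
# standardize(dat)
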
# Fit linear regression without site data
linefit_no_site <- function(data) {
  # Fit a simple linear regression model to standardized response over time
  model <- lm(stand.response ~ year, data = na.omit(data))
  # Extract the coefficient table from the model summary
  t.model <- coef(summary(model))
  slope_pval <- t.model["year", "Pr(>|t|)"]
  # Package key statistics into a clean data frame
  output <- data.frame(
    start_year = min(data$year, na.rm = TRUE), # First year in the window
    N_data = nrow(data), # Number of observations
    N_years = length(unique(data$year)), # Number of unique years
    slope = t.model["year", "Estimate"], # Slope estimate
    slope_se = t.model["year", "Std. Error"], # Slope standard error
    p_value = slope_pval, # p-value for slope
    intercept = t.model["(Intercept)", "Estimate"], # Intercept estimate
    intercept_se = t.model["(Intercept)", "Std. Error"], # Intercept standard error
    intercept_p_value = t.model["(Intercept)", "Pr(>|t|)"] # p-value for intercept
  )
  return(output)
}
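
# Usage sketch (reuses the hypothetical 'dat' from above; the site column,
# if present, is ignored by this fit):
# linefit_no_site(standardize(dat))
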
# Fit linear regression with site data
linefit_with_site <- function(data) {
  library(lme4)
  library(lmerTest) # lmerTest adds p-values to the lmer coefficient table
  # Remove missing values
  data <- na.omit(data)
  # Count unique sites
  n_sites <- length(unique(data$site))
  # If there are fewer than 3 sites, fall back to simple linear regression
  if (n_sites < 3) {
    return(linefit_no_site(data))
  }
  # Fit a mixed-effects model with random intercepts for site
  model <- lmer(stand.response ~ year + (1 | site), data = data)
  t.model <- coef(summary(model))
  slope_pval <- t.model["year", "Pr(>|t|)"]
  # Return fixed-effect estimates in a data frame
  output <- data.frame(
    start_year = min(data$year, na.rm = TRUE),
    N_data = nrow(data),
    N_years = length(unique(data$year)),
    slope = t.model["year", "Estimate"],
    slope_se = t.model["year", "Std. Error"],
    p_value = slope_pval,
    intercept = t.model["(Intercept)", "Estimate"],
    intercept_se = t.model["(Intercept)", "Std. Error"],
    intercept_p_value = t.model["(Intercept)", "Pr(>|t|)"]
  )
  return(output)
}
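
# Usage sketch ('dat3' is hypothetical, with >= 3 sites so the mixed model is
# used; lme4/lmerTest must be installed, and noise-only data may trigger a
# singular-fit warning):
# dat3 <- data.frame(site = rep(c("A", "B", "C"), each = 4),
#                    year = rep(2001:2004, 3),
#                    response = rnorm(12))
# linefit_with_site(standardize(dat3))
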
# Refit the trend after removing every combination of sites for a given
# removal window (window = % of sites to remove)
breakup_systematic <- function(data, window) {
  # Step 1: Initialize an empty output dataframe
  output <- data.frame(
    combo_num = numeric(0),
    N_data = numeric(0),
    N_years = numeric(0),
    N_sites = numeric(0),
    perc_site_removed = numeric(0),
    site_removed = character(0),
    year_slope = numeric(0),
    slope_sd = numeric(0),
    slope_se = numeric(0),
    slope_pval = numeric(0),
    intercept = numeric(0)
  )
  # Step 2: Convert the window (% of sites) into a count of sites to remove
  # (note: despite its name, perc_site_removed holds this count)
  numsites <- length(unique(data$site))
  perc_site_removed <- round(numsites * (window / 100))
  # Step 3: Generate all combinations of sites to remove
  site_combinations <- combn(unique(data$site), perc_site_removed, simplify = FALSE)
  # Step 4: Loop through each site combination
  for (iterations in 1:length(site_combinations)) {
    chunk <- site_combinations[[iterations]]
    # Step 4a: Remove selected sites
    chunk.1 <- subset(data, !site %in% chunk)
    # Step 4b: Fit model to remaining data
    out <- linefit_with_site(chunk.1)
    # Step 4c: Add site-removal metadata
    out$site_removed <- paste(chunk, collapse = ", ")
    out$combo_num <- iterations
    out$perc_site_removed <- perc_site_removed
    out$N_sites <- length(unique(chunk.1$site))
    # Step 4d: Back-convert the slope SE to an SD (SD = SE * sqrt(N))
    out$slope_sd <- out$slope_se * sqrt(out$N_data)
    # Step 4e: Append to output
    output <- rbind(output, out)
  }
  # Step 5: Select final columns to return
  output <- dplyr::select(output,
    combo_num, N_data, N_years, N_sites, perc_site_removed, site_removed,
    year_slope = slope, slope_sd, slope_se, slope_pval = p_value, intercept
  )
  return(output)
}
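
# Usage sketch ('dat6' is hypothetical; with 6 sites, window = 20 removes
# round(6 * 0.2) = 1 site per combination):
# dat6 <- data.frame(site = rep(LETTERS[1:6], each = 5),
#                    year = rep(2001:2005, 6),
#                    response = rnorm(30))
# breakup_systematic(standardize(dat6), window = 20)
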
# Subset data across all window sizes
multiple_breakups <- function(data) {
  # Step 1: Standardize the data
  data1 <- standardize(data)
  # Step 2: Initialize an empty dataframe
  output <- data.frame(
    combo_num = numeric(0),
    N_data = numeric(0),
    N_years = numeric(0),
    N_sites = numeric(0),
    perc_site_removed = numeric(0),
    site_removed = character(0),
    year_slope = numeric(0),
    slope_sd = numeric(0),
    slope_se = numeric(0),
    slope_pval = numeric(0),
    intercept = numeric(0)
  )
  # Step 3: Loop over site-removal windows of 1%, 11%, ..., 91%
  for (i in seq(1, 100, by = 10)) {
    outeach <- breakup_systematic(data1, i)
    output <- rbind(output, outeach)
  }
  return(output)
}
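
# Usage sketch (reuses the hypothetical 'dat6'; raw data is passed because
# standardize() is called internally):
# multiple_breakups(dat6)
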
# Calculate the minimum number of sites using the trend magnitude metric
min_site_magnitude <- function(data, min_percent = 95, error_multiplier = 1) {
  # Step 1: Run multiple site-removal breakups
  test <- multiple_breakups(data)
  # Step 2: Extract the 'true' slope and error from the full-data fit
  # (the window that removed zero sites)
  true_slope <- unique(subset(test, perc_site_removed == 0)$year_slope)
  true_error <- unique(subset(test, perc_site_removed == 0)$slope_se) * error_multiplier
  # Step 3: Define the acceptable slope range
  max_true <- true_slope + true_error
  min_true <- true_slope - true_error
  # Step 4: Get all unique N_sites values
  windows <- unique(test$N_sites)
  stability <- max(windows)
  prop.vec <- c()
  # Step 5: Loop through each N_sites value
  for (i in 1:length(windows)) {
    num_site <- windows[i]
    test_subset <- test[test$N_sites == num_site, ]
    number_of_windows <- nrow(test_subset)
    # Step 5a: Count how many runs have a slope within the "true" range
    correct_subset <- test_subset[
      test_subset$year_slope < max_true &
        test_subset$year_slope > min_true, ]
    number_of_correct <- nrow(correct_subset)
    # Step 5b: Calculate the % of runs considered "stable"
    percentage_correct <- 100 * number_of_correct / number_of_windows
    prop.vec <- c(prop.vec, percentage_correct)
    # Step 5c: Keep the smallest N_sites that meets the stability criterion
    if (percentage_correct >= min_percent && num_site < stability) {
      stability <- num_site
    }
  }
  # Step 6: Return summary dataframe
  df <- data.frame(
    num_site = windows,
    proportion_correct = prop.vec,
    minimum_site = stability
  )
  return(df)
}
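
# Usage sketch (reuses the hypothetical 'dat6'; returns one row per N_sites
# value, with the minimum stable site count repeated in 'minimum_site'):
# min_site_magnitude(dat6, min_percent = 95, error_multiplier = 1)
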
# Identify which sites provide the most/least unique information and the best
# minimum-site combination
usable_sites <- function(data, min_percent = 95, error_multiplier = 1) {
  require(magrittr, quietly = TRUE) # for the %>% pipe used below
  # Step 1: Run all site-removal combinations and model fits
  test <- multiple_breakups(data)
  # Step 2: Get the "true" slope from the full dataset (no site removal)
  true_slope <- unique(subset(test, perc_site_removed == 0)$year_slope)
  # Step 3: Calculate the absolute difference from the true slope for each combination
  test$slope_diff <- abs(test$year_slope - true_slope)
  # Step 4: Determine the minimum number of sites required for stability
  site_stability <- min_site_magnitude(data, min_percent = min_percent, error_multiplier = error_multiplier)
  min_sites_required <- min(site_stability$minimum_site, na.rm = TRUE)
  # Step 5: Filter to combinations that used exactly the minimum number of sites
  eligible_combos <- test[test$N_sites == min_sites_required, ]
  # Step 6: Identify the best-matching site combo (smallest slope difference)
  best_match_combo <- eligible_combos[which.min(eligible_combos$slope_diff), , drop = FALSE]
  # Step 7: Find the largest and smallest slope differences when exactly one site is removed
  one_site_removed <- test[test$perc_site_removed == 1, ]
  largest_diff_one_site <- one_site_removed[which.max(one_site_removed$slope_diff), , drop = FALSE]
  smallest_diff_one_site <- one_site_removed[which.min(one_site_removed$slope_diff), , drop = FALSE]
  # Step 8: Generate the full sorted list of slope differences
  all_differences <- test %>%
    dplyr::select(site_removed, N_sites, year_slope, slope_diff) %>%
    dplyr::arrange(desc(slope_diff))
  # Step 9: Return all results in a nested tibble
  result <- tibble::tibble(
    minimum_sites_required = min_sites_required,
    smallest_difference_min_site = list(best_match_combo),
    largest_difference_one_site_removed = list(largest_diff_one_site),
    smallest_difference_one_site_removed = list(smallest_diff_one_site),
    all_slope_differences = list(all_differences)
  )
  return(result)
}
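
# Usage sketch (reuses the hypothetical 'dat6'; magrittr, dplyr, lme4,
# lmerTest, and tibble must be installed):
# res <- usable_sites(dat6, min_percent = 95)
# res$minimum_sites_required
# res$all_slope_differences[[1]] # nested list-columns hold full data frames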