vignettes/community-contributions.Rmd
community-contributions.Rmd
library(bitfield)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union

The bitfield package enables collaborative development of encoding protocols
through the bitfloat/standards repository. This guide shows how to create,
test, and contribute protocols that others can reuse.

You’ll need a GitHub Personal Access Token to contribute protocols:
# One-time setup for contributing: create a GitHub Personal Access Token,
# store it, then check that the standards repository can be listed.

# Generate GitHub token (opens browser)
usethis::create_github_token()

# Store token securely in R
gitcreds::gitcreds_set()

# Test your setup
bf_standards(action = "list")

Focus on atomic concepts - each protocol should test exactly one thing:
# Example: Create a protocol for data freshness
# The test function rounds the age in days and clamps it to 0..255 so that
# the result fits the 8 bits requested below.
dataAgeProtocol <- bf_protocol(
  name = "dataAgeDays",
  description = paste("Days since data collection, encoded as 8-bit integer",
                      "(0-255 days) where {daysSince} days are encoded"),
  test = "function(daysSince) { pmin(pmax(round(daysSince), 0), 255) }",
  example = list(daysSince = c(1, 15, 30, 90, 200)),
  type = "int",
  bits = 8
)

# Test the protocol with example data
test_data <- data.frame(
  sample_id = 1:9,
  # The last value exceeds 255 on purpose to exercise the upper clamp.
  days_old = c(1, 5, 12, 30, 45, 67, 120, 180, 300)
)

test_reg <- bf_registry(
  name = "freshness_test",
  description = "Data freshness testing"
)

# The protocol is handed over by the name of the object created above, and
# its `daysSince` argument is bound to the `days_old` column of `test_data`.
test_reg <- bf_map(
  protocol = "dataAgeProtocol",
  data = test_data,
  daysSince = days_old,
  registry = test_reg
)
test_reg
#> width 8
#> flags 1 --------
#>
#> pos encoding type col
#> 1 0.0.8/0 dataAgeProtocol days-old

Always test thoroughly with edge cases:
# Test encoding/decoding cycle
test_field <- bf_encode(registry = test_reg)
test_decoded <- bf_decode(test_field, registry = test_reg, verbose = FALSE)

# Verify protocol handles edge cases correctly.
# The decoded column holds each flag as a string of binary digits, so it is
# parsed as base 2. as.numeric() would read those digits as a decimal number
# (5 would be shown as 101), which makes the comparison below meaningless.
verification <- data.frame(
  original = test_data$days_old,
  decoded = strtoi(test_decoded$dataAgeProtocol_days_old, base = 2L),
  capped_at_255 = pmin(test_data$days_old, 255)
)

# Every decoded value must equal the input clamped to the 8-bit maximum;
# fail loudly instead of relying on a visual check of the printed table.
stopifnot(all(verification$decoded == verification$capped_at_255))

print(verification)
#> original decoded capped_at_255
#> 1 1 1 1
#> 2 5 5 5
#> 3 12 12 12
#> 4 30 30 30
#> 5 45 45 45
#> 6 67 67 67
#> 7 120 120 120
#> 8 180 180 180
#> 9 300 255 255

Use the package’s example data to develop realistic protocols:
# Examine the example data
bf_tbl
#> # A tibble: 9 × 5
#> x y commodity yield year
#> <dbl> <dbl> <fct> <dbl> <chr>
#> 1 25.3 59.5 soybean 11.2 2021
#> 2 27.9 58.1 maize 12.0 NA
#> 3 27.8 57.8 soybean 13.2 2021r
#> 4 27 59.2 NA 4.43 2021
#> 5 259 Inf honey 13.0 2021
#> 6 27.3 59.1 maize 8.55 2021
#> 7 26.1 58.4 soybean 11.3 2021
#> 8 26.5 NaN maize 10.6 2021
#> 9 0 0 soybean 9.01 2021

# Create a protocol for yield reliability based on coordinate quality.
# The test function returns, in this order of precedence:
#   0 - x or y is missing (is.na() is also TRUE for NaN)
#   1 - x and y are both exactly 0
#   2 - x or y is infinite
#   3 - none of the above
# Only codes 0-3 are produced at the moment; the 3 bits requested below
# leave codes 4-7 of the advertised 0-7 range unused.
yieldReliability <- bf_protocol(
  name = "yieldReliability",
  description = paste("Yield reliability score (0-7) based on coordinate quality.",
                      "Higher scores indicate more reliable yield measurements.",
                      "Coordinates {x}, {y} are evaluated for validity."),
  test = "function(x, y) { ifelse(is.na(x) | is.na(y), 0L, ifelse(x == 0 & y == 0, 1L, ifelse(is.infinite(x) | is.infinite(y), 2L, 3L))) }",
  example = list(x = c(25.3, 0, 259), y = c(59.5, 0, Inf)),
  type = "int",
  bits = 3 # 0-7 range needs 3 bits
)

# Test with the example data
reliability_reg <- bf_registry(
  name = "yield_reliability_test",
  description = "Test yield reliability protocol"
)
reliability_reg <- bf_map(
  protocol = "yieldReliability",
  data = bf_tbl,
  x = x,
  y = y,
  registry = reliability_reg
)
reliability_field <- bf_encode(registry = reliability_reg)
reliability_decoded <- bf_decode(
  reliability_field,
  registry = reliability_reg,
  verbose = FALSE
)

# Check results - first see what columns exist
print("Available columns:")
#> [1] "Available columns:"
print(names(reliability_decoded))
#> [1] "yieldReliability_x-y"

# Put the decoded flag next to the input rows it was derived from.
result_check <- bf_tbl %>%
  bind_cols(reliability_decoded)
print(result_check)
#> # A tibble: 9 × 6
#> x y commodity yield year `yieldReliability_x-y`
#> <dbl> <dbl> <fct> <dbl> <chr> <chr>
#> 1 25.3 59.5 soybean 11.2 2021 110
#> 2 27.9 58.1 maize 12.0 NA 110
#> 3 27.8 57.8 soybean 13.2 2021r 110
#> 4 27 59.2 NA 4.43 2021 110
#> 5 259 Inf honey 13.0 2021 100
#> 6 27.3 59.1 maize 8.55 2021 110
#> 7 26.1 58.4 soybean 11.3 2021 110
#> 8 26.5 NaN maize 10.6 2021 000
#> 9 0 0 soybean 9.01 2021 010

When improving existing protocols, use versioning:
# Enhanced version of a basic protocol
# Flags a value (1) when it is NA, an empty string, or one of the placeholder
# codes -999, -99 or 9999; everything else yields 0.
# The placeholders are matched by their text form: %in% coerces a numeric
# input to character before matching, so the same test works for numeric and
# character columns. This matters for the example below, where c() turns the
# whole vector, including -999, into character.
enhancedNaCheck <- bf_protocol(
  name = "enhancedNaCheck",
  description = paste("Enhanced missing value detection for {x}.",
                      "Detects NA, empty strings, and placeholder values",
                      "(-999, -99, 9999)."),
  test = "function(x) { as.integer(is.na(x) | x %in% c('', '-999', '-99', '9999')) }",
  example = list(x = c("valid", "", NA, -999, "normal")),
  type = "int",
  bits = 1,
  version = "1.1.0",
  extends = "na_1.0.0",
  note = "Enhanced to detect placeholder values and empty strings"
)

Handle categories efficiently by understanding your data:
# Check categories in example data
unique(bf_tbl$commodity)
#> [1] soybean maize <NA> honey
#> Levels: honey maize soybean

# Create optimized categorical protocol.
# The test function maps soybean -> 1, maize -> 2, honey -> 3, and both NA
# and any other value -> 0.
commodityProtocol <- bf_protocol(
  name = "agriculturalCommodity",
  description = paste("Agricultural commodity classification with",
                      "3-bit encoding (0-7): 1=soybean, 2=maize, 3=honey, 0=NA/unknown"),
  test = "function(commodity) { ifelse(is.na(commodity), 0L, ifelse(commodity == 'soybean', 1L, ifelse(commodity == 'maize', 2L, ifelse(commodity == 'honey', 3L, 0L)))) }",
  example = list(commodity = c("soybean", "maize", "honey", NA, "unknown")),
  type = "int",
  bits = 3
)

# Test the categorical protocol
cat_reg <- bf_registry(
  name = "commodity_test",
  description = "Commodity encoding test"
)
cat_reg <- bf_map(
  protocol = "commodityProtocol",
  data = bf_tbl,
  commodity = commodity,
  registry = cat_reg
)
cat_field <- bf_encode(registry = cat_reg)
cat_decoded <- bf_decode(cat_field, registry = cat_reg, verbose = FALSE)

# Show each input category next to the flag decoded for it.
commodity_check <- bf_tbl %>%
  bind_cols(cat_decoded) %>%
  select(commodity, commodityProtocol_commodity)
print(commodity_check)
#> # A tibble: 9 × 2
#> commodity commodityProtocol_commodity
#> <fct> <chr>
#> 1 soybean 010
#> 2 maize 100
#> 3 soybean 010
#> 4 NA 000
#> 5 honey 110
#> 6 maize 100
#> 7 soybean 010
#> 8 maize 100
#> 9 soybean 010

Once your protocol is tested and documented:
# Push to community standards
# `protocol` refers to the object created with bf_protocol() earlier in this
# guide (dataAgeProtocol); the name has to match that object exactly.
bf_standards(
  protocol = dataAgeProtocol,
  remote = "environmental/temporal",
  action = "push",
  version = "1.0.0",
  change = "Initial release: data age encoding for environmental monitoring"
)

# Push agricultural commodity protocol (the commodityProtocol object
# created earlier in this guide)
bf_standards(
  protocol = commodityProtocol,
  remote = "agricultural/crops",
  action = "push",
  version = "1.0.0",
  change = "Agricultural commodity encoding optimized for common crops"
)

Pull and use protocols contributed by others:
# List available protocols
available_protocols <- bf_standards(action = "list")

# Pull a specific protocol
soil_moisture <- bf_standards(
  protocol = "soil_moisture",
  remote = "environmental/soil",
  action = "pull"
)

# Use in your analysis
# `my_soil_data` is a placeholder for your own data frame with a
# `moisture_percent` column; it is not created anywhere in this guide.
soil_reg <- bf_registry(
  name = "soil_analysis",
  description = "Soil moisture analysis"
)
# NOTE(review): the earlier examples pass the protocol to bf_map() as a quoted
# object name, while the pulled object is passed directly here - confirm that
# bf_map() accepts both forms.
soil_reg <- bf_map(
  protocol = soil_moisture,
  data = my_soil_data,
  valMoisture = moisture_percent,
  registry = soil_reg
)

Before contributing protocols, ensure they meet quality standards:
Contributing protocols helps build standardized metadata practices across scientific domains, enabling better cross-workflow integration and data reusability.