library(bitfield)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

Contributing to Community Standards

The bitfield package enables collaborative development of encoding protocols through the bitfloat/standards repository. This guide shows how to create, test, and contribute protocols that others can reuse.

Setting Up for Contributions

GitHub Authentication

You’ll need a GitHub Personal Access Token to contribute protocols:

# Generate a GitHub Personal Access Token (opens a browser window)
usethis::create_github_token()

# Store the token securely so that R can use it
gitcreds::gitcreds_set()

# Test your setup by listing the available community protocols
bf_standards(action = "list")

Git Configuration

Configure your Git identity for proper attribution:

# Record the name and e-mail address used to attribute your contributions
usethis::use_git_config(
  user.name = "Your Name",
  user.email = "your.email@example.com"
)

Creating Custom Protocols

Design Principles

Focus on atomic concepts: each protocol should test exactly one thing. The example below encodes a single property, the age of a record in days:

# Example: a protocol that encodes data freshness as one 8-bit integer
dataAgeProtocol <- bf_protocol(
  name = "dataAgeDays",
  description = paste(
    "Days since data collection, encoded as 8-bit integer",
    "(0-255 days) where {daysSince} days are encoded"
  ),
  test = "function(daysSince) { pmin(pmax(round(daysSince), 0), 255) }",
  example = list(daysSince = c(1, 15, 30, 90, 200)),
  type = "int",
  bits = 8
)

# Small data set for exercising the protocol; the last value (300) lies
# above 255 so that the upper limit is tested as well
test_data <- data.frame(
  sample_id = seq_len(9),
  days_old = c(1, 5, 12, 30, 45, 67, 120, 180, 300)
)

# Create a registry and map the protocol onto the days_old column
test_reg <- bf_registry(
  name = "freshness_test",
  description = "Data freshness testing"
)
test_reg <- bf_map(
  protocol = "dataAgeProtocol",
  data = test_data,
  daysSince = days_old,
  registry = test_reg
)

test_reg
#>   width 8
#>   flags 1  --------
#> 
#>   pos encoding type            col
#>   1   0.0.8/0  dataAgeProtocol days-old

Testing Your Protocol

Always test thoroughly with edge cases:

# Test encoding/decoding cycle
test_field <- bf_encode(registry = test_reg)
test_decoded <- bf_decode(test_field, registry = test_reg, verbose = FALSE)

# Verify protocol handles edge cases correctly. The decoded flag is a string
# of binary digits, so it is parsed with base = 2; as.numeric() would read
# "101" as one hundred and one rather than five, and the decoded column could
# never be compared with the capped input.
verification <- data.frame(
  original = test_data$days_old,
  decoded = strtoi(test_decoded$dataAgeProtocol_days_old, base = 2L),
  capped_at_255 = pmin(test_data$days_old, 255)
)

print(verification)
#>   original decoded capped_at_255
#> 1        1       1             1
#> 2        5       5             5
#> 3       12      12            12
#> 4       30      30            30
#> 5       45      45            45
#> 6       67      67            67
#> 7      120     120           120
#> 8      180     180           180
#> 9      300     255           255

Working with the bf_tbl Example

Use the package’s example data to develop realistic protocols:

# Examine the package's example data; the printed rows include NA, NaN, Inf
# and 0/0 coordinate entries, which make useful edge cases
bf_tbl
#> # A tibble: 9 × 5
#>       x     y commodity yield year 
#>   <dbl> <dbl> <fct>     <dbl> <chr>
#> 1  25.3  59.5 soybean   11.2  2021 
#> 2  27.9  58.1 maize     12.0  NA   
#> 3  27.8  57.8 soybean   13.2  2021r
#> 4  27    59.2 NA         4.43 2021 
#> 5 259   Inf   honey     13.0  2021 
#> 6  27.3  59.1 maize      8.55 2021 
#> 7  26.1  58.4 soybean   11.3  2021 
#> 8  26.5 NaN   maize     10.6  2021 
#> 9   0     0   soybean    9.01 2021

# Protocol that scores yield reliability from the quality of the coordinates:
#   0 = a coordinate is missing, 1 = both coordinates are zero,
#   2 = a coordinate is infinite, 3 = none of these problems
yieldReliability <- bf_protocol(
  name = "yieldReliability",
  description = paste(
    "Yield reliability score (0-7) based on coordinate quality.",
    "Higher scores indicate more reliable yield measurements.",
    "Coordinates {x}, {y} are evaluated for validity."
  ),
  test = "function(x, y) { ifelse(is.na(x) | is.na(y), 0L, ifelse(x == 0 & y == 0, 1L, ifelse(is.infinite(x) | is.infinite(y), 2L, 3L))) }",
  example = list(x = c(25.3, 0, 259), y = c(59.5, 0, Inf)),
  type = "int",
  bits = 3  # three bits span 0-7; the test above only emits 0-3
)

# Map the protocol onto the x and y columns of the example data
reliability_reg <- bf_registry(
  name = "yield_reliability_test",
  description = "Test yield reliability protocol"
)
reliability_reg <- bf_map(
  protocol = "yieldReliability",
  data = bf_tbl,
  x = x,
  y = y,
  registry = reliability_reg
)

# Encode the flags, then decode them again for inspection
reliability_field <- bf_encode(registry = reliability_reg)
reliability_decoded <- bf_decode(
  reliability_field,
  registry = reliability_reg,
  verbose = FALSE
)

# Show which columns the decoding step produced
print("Available columns:")
#> [1] "Available columns:"
print(names(reliability_decoded))
#> [1] "yieldReliability_x-y"

# Place the decoded flags next to the rows they were computed from
result_check <- bind_cols(bf_tbl, reliability_decoded)

print(result_check)
#> # A tibble: 9 × 6
#>       x     y commodity yield year  `yieldReliability_x-y`
#>   <dbl> <dbl> <fct>     <dbl> <chr> <chr>                 
#> 1  25.3  59.5 soybean   11.2  2021  110                   
#> 2  27.9  58.1 maize     12.0  NA    110                   
#> 3  27.8  57.8 soybean   13.2  2021r 110                   
#> 4  27    59.2 NA         4.43 2021  110                   
#> 5 259   Inf   honey     13.0  2021  100                   
#> 6  27.3  59.1 maize      8.55 2021  110                   
#> 7  26.1  58.4 soybean   11.3  2021  110                   
#> 8  26.5 NaN   maize     10.6  2021  000                   
#> 9   0     0   soybean    9.01 2021  010

Protocol Versioning and Extension

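When improving an existing protocol, publish the change as a new version instead of silently altering the old one: set version, and name the protocol you are building on in extends: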

# Enhanced version of a basic protocol. Values are matched as text with %in%,
# which never returns NA and finds the placeholders whether x is numeric or
# character. A guard on is.numeric(x) would skip placeholders held in a
# character vector, which is what c() produces for the mixed example below.
enhancedNaCheck <- bf_protocol(
  name = "enhancedNaCheck",
  description = paste("Enhanced missing value detection for {x}.",
                     "Detects NA, empty strings, and placeholder values (-999, -99, 9999)."),
  test = "function(x) { as.integer(is.na(x) | x %in% c('', '-999', '-99', '9999')) }",
  # written as strings because c() would coerce a bare -999 to "-999" anyway
  example = list(x = c("valid", "", NA, "-999", "normal")),
  type = "int",
  bits = 1,
  version = "1.1.0",
  extends = "na_1.0.0",
  note = "Enhanced to detect placeholder values and empty strings"
)

Efficient Categorical Encoding

Handle categories efficiently by understanding your data: inspect the distinct values first, then assign each one a small integer code:

# List the distinct commodity values present in the example data
unique(bf_tbl[["commodity"]])
#> [1] soybean maize   <NA>    honey  
#> Levels: honey maize soybean

# Protocol that stores each commodity as a small integer code
commodityProtocol <- bf_protocol(
  name = "agriculturalCommodity",
  description = paste(
    "Agricultural commodity classification with",
    "3-bit encoding (0-7): 1=soybean, 2=maize, 3=honey, 0=NA/unknown"
  ),
  test = "function(commodity) { ifelse(is.na(commodity), 0L, ifelse(commodity == 'soybean', 1L, ifelse(commodity == 'maize', 2L, ifelse(commodity == 'honey', 3L, 0L)))) }",
  example = list(commodity = c("soybean", "maize", "honey", NA, "unknown")),
  type = "int",
  bits = 3
)

# Map the categorical protocol onto the commodity column of the example data
cat_reg <- bf_registry(
  name = "commodity_test",
  description = "Commodity encoding test"
)
cat_reg <- bf_map(
  protocol = "commodityProtocol",
  data = bf_tbl,
  commodity = commodity,
  registry = cat_reg
)

# Encode the flags, then decode them again
cat_field <- bf_encode(registry = cat_reg)
cat_decoded <- bf_decode(cat_field, registry = cat_reg, verbose = FALSE)

# Keep only the commodity and the flag decoded for it, side by side
commodity_check <- select(
  bind_cols(bf_tbl, cat_decoded),
  commodity, commodityProtocol_commodity
)

print(commodity_check)
#> # A tibble: 9 × 2
#>   commodity commodityProtocol_commodity
#>   <fct>     <chr>                      
#> 1 soybean   010                        
#> 2 maize     100                        
#> 3 soybean   010                        
#> 4 NA        000                        
#> 5 honey     110                        
#> 6 maize     100                        
#> 7 soybean   010                        
#> 8 maize     100                        
#> 9 soybean   010

Contributing to the Community Repository

Once your protocol is tested and documented:

# Push the data age protocol to the community standards. The object passed
# here is the one created with bf_protocol() earlier in this guide.
bf_standards(
  protocol = dataAgeProtocol,
  remote = "environmental/temporal",
  action = "push",
  version = "1.0.0",
  change = "Initial release: data age encoding for environmental monitoring"
)

# Push agricultural commodity protocol
bf_standards(
  protocol = commodityProtocol,
  remote = "agricultural/crops",
  action = "push",
  version = "1.0.0",
  change = "Agricultural commodity encoding optimized for common crops"
)

Using Community Protocols

Pull and use protocols contributed by others:

# List available protocols
available_protocols <- bf_standards(action = "list")

# Pull a specific protocol
soil_moisture <- bf_standards(
  protocol = "soil_moisture",
  remote = "environmental/soil",
  action = "pull"
)

# Use in your analysis. The protocol is referenced by the quoted name of the
# object that holds it, as in the other bf_map() calls in this guide.
# NOTE: my_soil_data and its moisture_percent column are not defined in this
# guide; replace them with your own data frame and column.
soil_reg <- bf_registry(name = "soil_analysis", description = "Soil moisture analysis")
soil_reg <- bf_map(protocol = "soil_moisture", data = my_soil_data,
                  valMoisture = moisture_percent, registry = soil_reg)

Quality Assurance Checklist

Before contributing protocols, ensure they meet quality standards:

  • ✓ Atomic design: One concept per protocol
  • ✓ Edge case testing: Handle NA, Inf, extreme values appropriately
  • ✓ Clear documentation: Describe what the protocol does and how
  • ✓ Efficient encoding: Use appropriate bit width for data range
  • ✓ Round-trip verification: Encode/decode cycle preserves information
  • ✓ Example data: Include realistic test cases
  • ✓ Proper versioning: Use semantic versioning for updates

Getting Help

Contributing protocols helps build standardized metadata practices across scientific domains, enabling better cross-workflow integration and data reusability.