This guide provides practical recommendations for designing effective bitfields, selecting appropriate protocols, and avoiding common pitfalls when working with the bitfield package.
Before creating bitfields, identify what metadata needs to travel between workflows:
# Example: Quality assessment for agricultural data
bf_tbl
#> # A tibble: 9 × 5
#> x y commodity yield year
#> <dbl> <dbl> <fct> <dbl> <chr>
#> 1 25.3 59.5 soybean 11.2 2021
#> 2 27.9 58.1 maize 12.0 NA
#> 3 27.8 57.8 soybean 13.2 2021r
#> 4 27 59.2 NA 4.43 2021
#> 5 259 Inf honey 13.0 2021
#> 6 27.3 59.1 maize 8.55 2021
#> 7 26.1 58.4 soybean 11.3 2021
#> 8 26.5 NaN maize 10.6 2021
#> 9 0 0 soybean 9.01 2021
This dataset has several quality issues:
- Missing values (NA in the commodity and year columns)
- Invalid coordinates (x = 259, y = Inf, y = NaN)
- Suspicious data points (x = 0, y = 0)
- Mixed data formats (year “2021r”)
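A quick base-R scan along these lines (shown only as a sketch, using the columns of bf_tbl above) makes the issues visible before any protocols are chosen:
# Missing values per column
colSums(is.na(bf_tbl))
# Non-finite or suspicious zero coordinates
sum(!is.finite(bf_tbl$x) | bf_tbl$x == 0)
sum(!is.finite(bf_tbl$y) | bf_tbl$y == 0)
# Year entries that are not a plain 4-digit value
bf_tbl$year[!grepl("^[0-9]{4}$", bf_tbl$year)]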
Select protocols that balance information content with bit efficiency:
reg <- bf_registry(name = "quality_check",
description = "Agricultural data quality assessment")
# Detect missing commodities (1 bit)
reg <- bf_map(protocol = "na", data = bf_tbl, x = commodity, registry = reg)
# Check for infinite coordinates (1 bit each)
reg <- bf_map(protocol = "inf", data = bf_tbl, x = x, registry = reg)
reg <- bf_map(protocol = "inf", data = bf_tbl, x = y, registry = reg)
# Check for suspicious zero coordinates (1 bit)
reg <- bf_map(protocol = "matches", data = bf_tbl, x = x, registry = reg, set = 0)
reg
#> width 4
#> flags 4 -|-|-|-
#>
#> pos encoding type col
#> 1 0.0.1/0 na commodity
#> 2 0.0.1/0 inf x
#> 3 0.0.1/0 inf y
#> 4 0.0.1/0 matches x
Each protocol should test exactly one concept for maximum reusability.
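For contrast, a hand-rolled combined check (plain base R, not a bitfield protocol) folds several unrelated concepts into a single flag; once it is set, you can no longer tell which problem occurred, and no part of it can be reused elsewhere:
# Hypothetical combined flag: TRUE if any one of several unrelated issues is present
combined_flag <- is.na(bf_tbl$commodity) | !is.finite(bf_tbl$x) | bf_tbl$x == 0
Keeping each concept in its own protocol avoids this: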
# Good: Separate protocols for different quality checks
reg2 <- bf_registry(name = "detailed_quality", description = "Detailed quality flags")
reg2 <- bf_map(protocol = "na", data = bf_tbl, x = commodity, registry = reg2)
reg2 <- bf_map(protocol = "range", data = bf_tbl, x = yield, registry = reg2,
min = 0, max = 20, na.val = 1L) # Reasonable yield range
reg2 <- bf_map(protocol = "grepl", data = bf_tbl, x = year, registry = reg2,
pattern = "[0-9]{4}$", na.val = 0L) # Valid 4-digit year
reg2
#> width 3
#> flags 3 -|-|-
#>
#> pos encoding type col
#> 1 0.0.1/0 na commodity
#> 2 0.0.1/0 range yield
#> 3 0.0.1/0 grepl year
Design bitfields with the information that downstream users will need in mind:
# Encode with sufficient precision for downstream analysis
field <- bf_encode(registry = reg2)
decoded <- bf_decode(field, registry = reg2, verbose = FALSE)
# Combine with original data for downstream use
result <- bf_tbl %>%
bind_cols(decoded) %>%
bind_cols(bf_int = field$bf_int1)
head(result, 3)
#> # A tibble: 3 × 9
#> x y commodity yield year na_commodity range_yield grepl_year bf_int
#> <dbl> <dbl> <fct> <dbl> <chr> <chr> <chr> <chr> <int>
#> 1 25.3 59.5 soybean 11.2 2021 0 1 1 3
#> 2 27.9 58.1 maize 12.0 NA 0 1 0 2
#> 3 27.8 57.8 soybean 13.2 2021r 0 1 0 2
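As a hypothetical downstream step (not taken from the package documentation), the decoded flags can drive filtering directly with dplyr; note that bf_decode() returned them as character "0"/"1" values above:
# Keep rows whose commodity is present and whose year matched the 4-digit pattern
clean <- result %>%
filter(na_commodity == "0", grepl_year == "1")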
Problem: Trying to encode too much information in a single bitfield.
Solution: Focus on essential metadata that enables informed decisions:
# Simple but effective approach
simple_reg <- bf_registry(name = "essential", description = "Essential quality flags")
simple_reg <- bf_map(protocol = "na", data = bf_tbl, x = commodity, registry = simple_reg)
simple_reg <- bf_map(protocol = "numeric", data = bf_tbl, x = yield, registry = simple_reg,
format = "half") # Preserve yield with limited precision
simple_reg
#> width 17
#> flags 2 -|----------------
#>
#> pos encoding type col
#> 1 0.0.1/0 na commodity
#> 2 1.5.10/15 numeric yield
Problem: Not testing protocols with edge cases.
Solution: Always test with problematic data:
# Test with the problematic rows from bf_tbl
problematic_rows <- bf_tbl[c(4, 5, 8, 9), ] # NA, Inf, NaN, zeros
print(problematic_rows)
#> # A tibble: 4 × 5
#> x y commodity yield year
#> <dbl> <dbl> <fct> <dbl> <chr>
#> 1 27 59.2 NA 4.43 2021
#> 2 259 Inf honey 13.0 2021
#> 3 26.5 NaN maize 10.6 2021
#> 4 0 0 soybean 9.01 2021
test_reg <- bf_registry(name = "edge_test", description = "Edge case testing")
test_reg <- bf_map(protocol = "na", data = problematic_rows, x = commodity, registry = test_reg)
test_reg <- bf_map(protocol = "inf", data = problematic_rows, x = x, registry = test_reg)
# Verify encoding/decoding works
test_field <- bf_encode(registry = test_reg)
test_decoded <- bf_decode(test_field, registry = test_reg, verbose = FALSE)
test_decoded
#> # A tibble: 4 × 2
#> na_commodity inf_x
#> <chr> <chr>
#> 1 1 0
#> 2 0 0
#> 3 0 0
#> 4 0 0
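As an additional sketch (using the character flags exactly as decoded above), the flags can also be compared against direct checks on the source rows:
# The decoded NA flag should agree with is.na() on the original commodity column
all(test_decoded$na_commodity == ifelse(is.na(problematic_rows$commodity), "1", "0"))
# The decoded Inf flag should agree with is.infinite() on the original x column (all finite here)
all(test_decoded$inf_x == ifelse(is.infinite(problematic_rows$x), "1", "0"))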
Problem: Using too many bits for unnecessary precision.
Solution: Match precision to actual needs; half precision (16 bits), for instance, carries roughly three significant decimal digits, which is ample for yields in this range:
# Compare the bit cost of numeric encodings; both registries use half precision here,
# so they end up with the same width
reg_precise <- bf_registry(name = "precise", description = "High precision encoding")
reg_efficient <- bf_registry(name = "efficient", description = "Efficient encoding")
# Half precision (16 bits)
reg_precise <- bf_map(protocol = "numeric", data = bf_tbl, x = yield,
registry = reg_precise, format = "half")
# Half precision again (16 bits); a different format, where supported, would change the width
reg_efficient <- bf_map(protocol = "numeric", data = bf_tbl, x = yield,
registry = reg_efficient, format = "half")
cat("Precise encoding:", reg_precise@width, "bits\n")
#> Precise encoding: 16 bits
cat("Efficient encoding:", reg_efficient@width, "bits\n")
#> Efficient encoding: 16 bits
Handle categorical variables efficiently:
# Check unique categories first
unique(bf_tbl$commodity)
#> [1] soybean maize <NA> honey
#> Levels: honey maize soybean
# Encode categories (automatically determines bit needs)
cat_reg <- bf_registry(name = "categories", description = "Categorical encoding")
cat_reg <- bf_map(protocol = "category", data = bf_tbl, x = commodity,
registry = cat_reg, na.val = 0L)
cat_reg
#> width 2
#> flags 1 --
#>
#> pos encoding type col
#> 1 0.0.2/0 category commodity
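The reported width of 2 bits matches the usual back-of-envelope bound for categorical data; this is a rough check, not the package's documented sizing rule:
# Minimum bits needed to distinguish the distinct commodity levels
n_levels <- nlevels(bf_tbl$commodity) # 3 levels in this example
ceiling(log2(n_levels)) # = 2, matching the registry width above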
# Well-documented registry
final_reg <- bf_registry(
name = "agricultural_qa",
description = "Agricultural data quality assessment: missing-value detection for commodity and half-precision (16-bit) yield encoding"
)
final_reg <- bf_map(protocol = "na", data = bf_tbl, x = commodity, registry = final_reg)
final_reg <- bf_map(protocol = "numeric", data = bf_tbl, x = yield, registry = final_reg,
format = "half")
final_reg
#> width 17
#> flags 2 -|----------------
#>
#> pos encoding type col
#> 1 0.0.1/0 na commodity
#> 2 1.5.10/15 numeric yield
# Always test encode/decode cycle
final_field <- bf_encode(registry = final_reg)
final_decoded <- bf_decode(final_field, registry = final_reg, verbose = FALSE)
# Check that important information is preserved
original_na_count <- sum(is.na(bf_tbl$commodity))
decoded_na_count <- sum(final_decoded$na_commodity == "1")
cat("Original NA count:", original_na_count, "\n")
#> Original NA count: 1
cat("Decoded NA count:", decoded_na_count, "\n")
#> Decoded NA count: 1
cat("Information preserved:", original_na_count == decoded_na_count, "\n")
#> Information preserved: TRUE
Following these practices ensures that your bitfields effectively bridge workflows while maintaining data integrity and computational efficiency.