Title: | Methods for Image-Based Cell Profiling |
---|---|
Description: | `cytominer` is a suite of common functions used to process high-dimensional readouts from image-based cell profiling experiments. |
Authors: | Tim Becker [aut], Allen Goodman [aut], Claire McQuin [aut], Mohammad Rohban [aut], Shantanu Singh [aut, cre] |
Maintainer: | Shantanu Singh <[email protected]> |
License: | BSD_3_clause + file LICENSE |
Version: | 0.2.2.9000 |
Built: | 2025-02-11 05:13:48 UTC |
Source: | https://github.com/cytomining/cytominer |
aggregate
aggregates data based on the specified aggregation method.
aggregate( population, variables, strata, operation = "mean", univariate = TRUE, ... )
aggregate( population, variables, strata, operation = "mean", univariate = TRUE, ... )
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
strata |
character vector specifying grouping variables for aggregation. |
operation |
optional character string specifying method for aggregation,
e.g. |
univariate |
boolean specifying whether the aggregation function is univariate or multivariate. |
... |
optional arguments passed to aggregation operation |
aggregated data of the same class as population
.
population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), Area = c(10, 12, 15, 16, 8, 8, 7, 7), Intensity = c(3, -3, 35, -3, 3, 0, 9, -7) ) variables <- c("Area", "Intensity") strata <- c("Metadata_group", "Metadata_batch") aggregate(population, variables, strata, operation = "mean") aggregate(population, variables, strata, operation = "mean+sd") aggregate(population, variables, strata, operation = "median") aggregate(population, variables, strata, operation = "covariance", univariate = FALSE)
population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), Area = c(10, 12, 15, 16, 8, 8, 7, 7), Intensity = c(3, -3, 35, -3, 3, 0, 9, -7) ) variables <- c("Area", "Intensity") strata <- c("Metadata_group", "Metadata_batch") aggregate(population, variables, strata, operation = "mean") aggregate(population, variables, strata, operation = "mean+sd") aggregate(population, variables, strata, operation = "median") aggregate(population, variables, strata, operation = "covariance", univariate = FALSE)
correlation_threshold
returns list of variables such that no two
variables have a correlation greater than a specified threshold.
correlation_threshold(variables, sample, cutoff = 0.9, method = "pearson")
correlation_threshold(variables, sample, cutoff = 0.9, method = "pearson")
variables |
character vector specifying observation variables. |
sample |
tbl containing sample used to estimate parameters. |
cutoff |
threshold between [0,1] that defines the minimum correlation of a selected feature. |
method |
optional character string specifying method for calculating
correlation. This must be one of the strings |
correlation_threshold
is a wrapper for caret::findCorrelation
.
character vector specifying observation variables to be excluded.
suppressMessages(suppressWarnings(library(magrittr))) sample <- tibble::tibble( x = rnorm(30), y = rnorm(30) / 1000 ) sample %<>% dplyr::mutate(z = x + rnorm(30) / 10) variables <- c("x", "y", "z") head(sample) cor(sample) # `x` and `z` are highly correlated; one of them will be removed correlation_threshold(variables, sample)
suppressMessages(suppressWarnings(library(magrittr))) sample <- tibble::tibble( x = rnorm(30), y = rnorm(30) / 1000 ) sample %<>% dplyr::mutate(z = x + rnorm(30) / 10) variables <- c("x", "y", "z") head(sample) cor(sample) # `x` and `z` are highly correlated; one of them will be removed correlation_threshold(variables, sample)
NA
s per variable.count_na_rows
counts the number of NA
s per variable.
count_na_rows(population, variables)
count_na_rows(population, variables)
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
data frame with frequency of NA
s per variable.
population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7), AreaShape_length = c(2, 3, NA, NA, 4, 5, 1, 5) ) variables <- c("AreaShape_Area", "AreaShape_length") count_na_rows(population, variables)
population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7), AreaShape_length = c(2, 3, NA, NA, 4, 5, 1, 5) ) variables <- c("AreaShape_Area", "AreaShape_length") count_na_rows(population, variables)
covariance
computes the covariance matrix and vectorize it.
covariance(population, variables)
covariance(population, variables)
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
data frame of 1 row comprising vectorized covariance matrix.
population <- tibble::tibble( x = rnorm(30), y = rnorm(30), z = rnorm(30) ) variables <- c("x", "y") covariance(population, variables)
population <- tibble::tibble( x = rnorm(30), y = rnorm(30), z = rnorm(30) ) variables <- c("x", "y") covariance(population, variables)
drop_na_columns
returns list of variables which have greater than a
specified threshold number of NA
s.
drop_na_columns(population, variables, cutoff = 0.05)
drop_na_columns(population, variables, cutoff = 0.05)
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
cutoff |
threshold between [0,1]. Variables with an |
character vector specifying observation variables to be excluded.
population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7), AreaShape_Length = c(2, 3, NA, NA, 4, 5, 1, 5) ) variables <- c("AreaShape_Area", "AreaShape_Length") drop_na_columns(population, variables)
population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7), AreaShape_Length = c(2, 3, NA, NA, 4, 5, 1, 5) ) variables <- c("AreaShape_Area", "AreaShape_Length") drop_na_columns(population, variables)
NA
in all specified variables.drop_na_rows
drops rows that are NA
in all specified variables.
drop_na_rows(population, variables)
drop_na_rows(population, variables)
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
population
without rows that have NA
in all specified
variables.
population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), AreaShape_Area = c(10, 12, NA, 16, 8, 8, 7, 7), AreaShape_Length = c(2, 3, NA, NA, 4, 5, 1, 5) ) variables <- c("AreaShape_Area", "AreaShape_Length") drop_na_rows(population, variables)
population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), AreaShape_Area = c(10, 12, NA, 16, 8, 8, 7, 7), AreaShape_Length = c(2, 3, NA, NA, 4, 5, 1, 5) ) variables <- c("AreaShape_Area", "AreaShape_Length") drop_na_rows(population, variables)
extract_subpopulations
identifies clusters in the reference and
population sets and reports the frequency of points in each cluster for the
two sets.
extract_subpopulations(population, reference, variables, k)
extract_subpopulations(population, reference, variables, k)
population |
tbl with grouping (metadata) and observation variables. |
reference |
tbl with grouping (metadata) and observation variables.
Columns of |
variables |
character vector specifying observation variables. |
k |
scalar specifying number of clusters. |
list containing clusters centers (subpop_centers
), two
normalized histograms specifying frequency of each clusters in population
and reference (subpop_profiles
), and cluster prediction and distance
to the predicted cluster for all input data (population_clusters
and
reference_clusters
).
data <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), AreaShape_Area = c(10, 12, NA, 16, 8, 8, 7, 7), AreaShape_Length = c(2, 3, NA, NA, 4, 5, 1, 5) ) variables <- c("AreaShape_Area", "AreaShape_Length") population <- dplyr::filter(data, Metadata_group == "experiment") reference <- dplyr::filter(data, Metadata_group == "control") extract_subpopulations( population = population, reference = reference, variables = variables, k = 3 )
data <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), AreaShape_Area = c(10, 12, NA, 16, 8, 8, 7, 7), AreaShape_Length = c(2, 3, NA, NA, 4, 5, 1, 5) ) variables <- c("AreaShape_Area", "AreaShape_Length") population <- dplyr::filter(data, Metadata_group == "experiment") reference <- dplyr::filter(data, Metadata_group == "control") extract_subpopulations( population = population, reference = reference, variables = variables, k = 3 )
find_significant_pcs
finds significant PC's given the eigenvalues.
find_significant_pcs(S, method = "outlier", n = NULL, d = NULL)
find_significant_pcs(S, method = "outlier", n = NULL, d = NULL)
S |
numeric vector with eigenvalues of covariance matrix, sorted in descending order. |
method |
optional string specifying method to estimate number of significant PCs |
n |
optional integer specifying number of rows in the data matrix.
Default is |
d |
optional integer specifying number of columns in the data matrix.
Default is |
number of significant PCs
generalized_log
transforms specified observation variables using
.
generalized_log(population, variables, offset = 1)
generalized_log(population, variables, offset = 1)
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
offset |
optional offset parameter for the transformation. |
transformed data of the same class as population
.
population <- tibble::tibble( Metadata_Well = c("A01", "A02", "B01", "B02"), Intensity_DNA = c(8, 20, 12, 32) ) variables <- c("Intensity_DNA") generalized_log(population, variables)
population <- tibble::tibble( Metadata_Well = c("A01", "A02", "B01", "B02"), Intensity_DNA = c(8, 20, 12, 32) ) variables <- c("Intensity_DNA") generalized_log(population, variables)
generate_component_matrix
generates the sparse random component matrix
for performing sparse random projection. If density
is the density of
the sparse matrix and n_components
is the size of the projected space,
the elements of the random matrix are drawn from
generate_component_matrix(n_features, n_components, density)
generate_component_matrix(n_features, n_components, density)
n_features |
the dimensionality of the original space. |
n_components |
the dimensionality of the projected space. |
density |
the density of the sparse random matrix. |
-sqrt(1 / (density * n_components))
with probability density / 2
0
with probability 1 - density
sqrt(1 / (density * n_components))
with probability density / 2
A sparse random matrix of size (n_features, n_components)
.
M <- generate_component_matrix(500, 100, 0.3) M[1:10, 1:10]
M <- generate_component_matrix(500, 100, 0.3) M[1:10, 1:10]
husk
detects unwanted variation in the sample and removes it from the
population.
husk( population, variables, sample, remove_outliers = TRUE, epsilon = 1e-06, remove_signal = TRUE, flatten_noise = TRUE )
husk( population, variables, sample, remove_outliers = TRUE, epsilon = 1e-06, remove_signal = TRUE, flatten_noise = TRUE )
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
sample |
tbl containing sample that is used by the method to estimate
husking parameters. |
remove_outliers |
optional boolean specifying whether to remove
outliers. Default is |
epsilon |
optional parameter used in husking to offset
eigenvalues to avoid division by zero. Default is |
remove_signal |
optional boolean specifying whether to husk the signal
instead of only scaling it down. Default is |
flatten_noise |
optional boolean specifying whether to flatten the noise
instead of scaling it up. Default is |
transformed data of the same class as population
.
population <- tibble::tibble( Metadata_pert_name = c(NA, NA, NA, NA), Metadata_Well = c("A01", "A02", "B01", "B02"), Intensity_DNA = c(10, 20, 12, 32), Granularity_DNA = c(22, 20, NA, 32), Texture_DNA = c(5, 2, 43, 13) ) variables <- c("Intensity_DNA", "Texture_DNA") husk(population, variables, population, epsilon = 1, remove_signal = TRUE) husk(population, variables, population, epsilon = 1e-5, remove_signal = TRUE) husk(population, variables, population, epsilon = 1, remove_signal = FALSE) husk(population, variables, population, epsilon = 1e-5, remove_signal = FALSE)
population <- tibble::tibble( Metadata_pert_name = c(NA, NA, NA, NA), Metadata_Well = c("A01", "A02", "B01", "B02"), Intensity_DNA = c(10, 20, 12, 32), Granularity_DNA = c(22, 20, NA, 32), Texture_DNA = c(5, 2, 43, 13) ) variables <- c("Intensity_DNA", "Texture_DNA") husk(population, variables, population, epsilon = 1, remove_signal = TRUE) husk(population, variables, population, epsilon = 1e-5, remove_signal = TRUE) husk(population, variables, population, epsilon = 1, remove_signal = FALSE) husk(population, variables, population, epsilon = 1e-5, remove_signal = FALSE)
mark_outlier_rows
drops outlier rows.
mark_outlier_rows( population, variables, sample, method = "svd+iqr", outlier_col = "is_outlier", ... )
mark_outlier_rows( population, variables, sample, method = "svd+iqr", outlier_col = "is_outlier", ... )
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
sample |
tbl containing sample that is used by outlier removal methods
to estimate parameters. |
method |
optional character string specifying method for outlier
removal. There is currently only one option ( |
outlier_col |
optional character string specifying the name for the
column that will indicate outliers (in the output).
Default |
... |
arguments passed to outlier removal method. |
population
with an extra column is_outlier
.
suppressMessages(suppressWarnings(library(magrittr))) population <- tibble::tibble( Metadata_group = sample(c("a", "b"), 100, replace = TRUE), Metadata_type = sample(c("control", "trt"), 100, replace = TRUE), AreaShape_Area = c(rnorm(98), 20, 30), AreaShape_Eccentricity = rnorm(100) ) variables <- c("AreaShape_Area", "AreaShape_Eccentricity") sample <- population %>% dplyr::filter(Metadata_type == "control") population_marked <- cytominer::mark_outlier_rows( population, variables, sample, method = "svd+iqr" ) population_marked %>% dplyr::group_by(is_outlier) %>% dplyr::sample_n(3)
suppressMessages(suppressWarnings(library(magrittr))) population <- tibble::tibble( Metadata_group = sample(c("a", "b"), 100, replace = TRUE), Metadata_type = sample(c("control", "trt"), 100, replace = TRUE), AreaShape_Area = c(rnorm(98), 20, 30), AreaShape_Eccentricity = rnorm(100) ) variables <- c("AreaShape_Area", "AreaShape_Eccentricity") sample <- population %>% dplyr::filter(Metadata_type == "control") population_marked <- cytominer::mark_outlier_rows( population, variables, sample, method = "svd+iqr" ) population_marked %>% dplyr::group_by(is_outlier) %>% dplyr::sample_n(3)
normalize
normalizes observation variables based on the specified normalization method.
normalize( population, variables, strata, sample, operation = "standardize", ... )
normalize( population, variables, strata, sample, operation = "standardize", ... )
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
strata |
character vector specifying grouping variables for grouping prior to normalization. |
sample |
tbl containing sample that is used by normalization methods to estimate parameters. |
operation |
optional character string specifying method for normalization. This must be one of the strings |
... |
arguments passed to normalization operation |
normalized data of the same class as population
.
suppressMessages(suppressWarnings(library(magrittr))) population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7) ) variables <- c("AreaShape_Area") strata <- c("Metadata_batch") sample <- population %>% dplyr::filter(Metadata_group == "control") cytominer::normalize(population, variables, strata, sample, operation = "standardize")
suppressMessages(suppressWarnings(library(magrittr))) population <- tibble::tibble( Metadata_group = c( "control", "control", "control", "control", "experiment", "experiment", "experiment", "experiment" ), Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"), AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7) ) variables <- c("AreaShape_Area") strata <- c("Metadata_batch") sample <- population %>% dplyr::filter(Metadata_group == "control") cytominer::normalize(population, variables, strata, sample, operation = "standardize")
'replicate_correlation' measures replicate correlation of variables.
replicate_correlation( sample, variables, strata, replicates, replicate_by = NULL, split_by = NULL, cores = NULL )
replicate_correlation( sample, variables, strata, replicates, replicate_by = NULL, split_by = NULL, cores = NULL )
sample |
tbl containing sample used to estimate parameters. |
variables |
character vector specifying observation variables. |
strata |
character vector specifying grouping variables for grouping prior to normalization. |
replicates |
number of replicates. |
replicate_by |
optional character string specifying column containing the replicate id. |
split_by |
optional character string specifying column by which to split the sample into batches; replicate correlations will be calculate per batch. |
cores |
optional integer specifying number of CPU cores used for
parallel computing using |
data frame of variable quality measurements
set.seed(123) x1 <- rnorm(10) x2 <- x1 + rnorm(10) / 100 y1 <- rnorm(10) y2 <- y1 + rnorm(10) / 10 z1 <- rnorm(10) z2 <- z1 + rnorm(10) / 1 batch <- rep(rep(1:2, each = 5), 2) treatment <- rep(1:10, 2) replicate_id <- rep(1:2, each = 10) sample <- tibble::tibble( x = c(x1, x2), y = c(y1, y2), z = c(z1, z2), Metadata_treatment = treatment, Metadata_replicate_id = replicate_id, Metadata_batch = batch ) head(sample) # `replicate_correlation`` returns the median, min, and max # replicate correlation (across batches) per variable replicate_correlation( sample = sample, variables = c("x", "y", "z"), strata = c("Metadata_treatment"), replicates = 2, split_by = "Metadata_batch", replicate_by = "Metadata_replicate_id", cores = 1 )
set.seed(123) x1 <- rnorm(10) x2 <- x1 + rnorm(10) / 100 y1 <- rnorm(10) y2 <- y1 + rnorm(10) / 10 z1 <- rnorm(10) z2 <- z1 + rnorm(10) / 1 batch <- rep(rep(1:2, each = 5), 2) treatment <- rep(1:10, 2) replicate_id <- rep(1:2, each = 10) sample <- tibble::tibble( x = c(x1, x2), y = c(y1, y2), z = c(z1, z2), Metadata_treatment = treatment, Metadata_replicate_id = replicate_id, Metadata_batch = batch ) head(sample) # `replicate_correlation`` returns the median, min, and max # replicate correlation (across batches) per variable replicate_correlation( sample = sample, variables = c("x", "y", "z"), strata = c("Metadata_treatment"), replicates = 2, split_by = "Metadata_batch", replicate_by = "Metadata_replicate_id", cores = 1 )
sparse_random_projection
reduces the dimensionality of a population
by projecting the original data with a sparse random matrix. Generally more
efficient and faster to compute than a Gaussian random projection matrix,
while providing similar embedding quality.
sparse_random_projection(population, variables, n_components)
sparse_random_projection(population, variables, n_components)
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
n_components |
size of the projected feature space. |
Dimensionality reduced population
.
population <- tibble::tibble( Metadata_Well = c("A01", "A02", "B01", "B02"), AreaShape_Area_DNA = c(10, 12, 7, 7), AreaShape_Length_DNA = c(2, 3, 1, 5), Intensity_DNA = c(8, 20, 12, 32), Texture_DNA = c(5, 2, 43, 13) ) variables <- c("AreaShape_Area_DNA", "AreaShape_Length_DNA", "Intensity_DNA", "Texture_DNA") sparse_random_projection(population, variables, 2)
population <- tibble::tibble( Metadata_Well = c("A01", "A02", "B01", "B02"), AreaShape_Area_DNA = c(10, 12, 7, 7), AreaShape_Length_DNA = c(2, 3, 1, 5), Intensity_DNA = c(8, 20, 12, 32), Texture_DNA = c(5, 2, 43, 13) ) variables <- c("AreaShape_Area_DNA", "AreaShape_Length_DNA", "Intensity_DNA", "Texture_DNA") sparse_random_projection(population, variables, 2)
spherize
transforms specified observation variables by estimating a
sphering transformation on a sample and applying it to the population.
spherize(population, variables, sample, regularization_param = 1)
spherize(population, variables, sample, regularization_param = 1)
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
sample |
tbl containing sample that is used by the method to estimate
sphering parameters. |
regularization_param |
optional parameter used in sphering to offset eigenvalues to avoid division by zero. |
transformed data of the same class as population
.
population <- tibble::tibble( Metadata_Well = c("A01", "A02", "B01", "B02"), Intensity_DNA = c(8, 20, 12, 32), Texture_DNA = c(5, 2, 43, 13) ) variables <- c("Intensity_DNA", "Texture_DNA") spherize(population, variables, population, 0.01)
population <- tibble::tibble( Metadata_Well = c("A01", "A02", "B01", "B02"), Intensity_DNA = c(8, 20, 12, 32), Texture_DNA = c(5, 2, 43, 13) ) variables <- c("Intensity_DNA", "Texture_DNA") spherize(population, variables, population, 0.01)
stratify
stratifies operations.
stratify(population, sample, reducer, strata, ...)
stratify(population, sample, reducer, strata, ...)
population |
tbl with grouping (metadata) and observation variables. |
sample |
tbl with the same structure as |
reducer |
operation that is to applied in a stratified manner. |
strata |
optional character vector specifying grouping variables for stratification. |
... |
arguments passed to operation. |
population
with potentially extra columns.
suppressMessages(suppressWarnings(library(magrittr))) population <- tibble::tibble( Metadata_group = sample(c("a", "b"), 100, replace = TRUE), Metadata_type = sample(c("control", "trt"), 100, replace = TRUE), AreaShape_Area = c(rnorm(98), 20, 30), AreaShape_Eccentricity = rnorm(100) ) variables <- c("AreaShape_Area", "AreaShape_Eccentricity") strata <- c("Metadata_group") sample <- population %>% dplyr::filter(Metadata_type == "control") population_marked <- cytominer::stratify( reducer = cytominer::mark_outlier_rows, method = "svd+iqr", population = population, variables = variables, sample = sample, strata = strata ) population_marked %>% dplyr::group_by(is_outlier) %>% dplyr::sample_n(3)
suppressMessages(suppressWarnings(library(magrittr))) population <- tibble::tibble( Metadata_group = sample(c("a", "b"), 100, replace = TRUE), Metadata_type = sample(c("control", "trt"), 100, replace = TRUE), AreaShape_Area = c(rnorm(98), 20, 30), AreaShape_Eccentricity = rnorm(100) ) variables <- c("AreaShape_Area", "AreaShape_Eccentricity") strata <- c("Metadata_group") sample <- population %>% dplyr::filter(Metadata_type == "control") population_marked <- cytominer::stratify( reducer = cytominer::mark_outlier_rows, method = "svd+iqr", population = population, variables = variables, sample = sample, strata = strata ) population_marked %>% dplyr::group_by(is_outlier) %>% dplyr::sample_n(3)
svd_entropy
measures the contribution of each feature in decreasing
the data entropy.
svd_entropy(sample, variables, cores = NULL)
svd_entropy(sample, variables, cores = NULL)
sample |
tbl containing sample used to estimate parameters. |
variables |
character vector specifying observation variables. |
cores |
optional integer specifying number of CPU cores used for
parallel computing using |
data frame specifying the contribution of each feature in decreasing the data entropy. Higher values indicate more information.
sample <- tibble::tibble( AreaShape_MinorAxisLength = c(10, 12, 15, 16, 8, 8, 7, 7, 13, 18), AreaShape_MajorAxisLength = c(35, 18, 22, 16, 9, 20, 11, 15, 18, 42), AreaShape_Area = c(245, 151, 231, 179, 50, 112, 53, 73, 164, 529) ) variables <- c("AreaShape_MinorAxisLength", "AreaShape_MajorAxisLength", "AreaShape_Area") svd_entropy(sample, variables, cores = 1)
sample <- tibble::tibble( AreaShape_MinorAxisLength = c(10, 12, 15, 16, 8, 8, 7, 7, 13, 18), AreaShape_MajorAxisLength = c(35, 18, 22, 16, 9, 20, 11, 15, 18, 42), AreaShape_Area = c(245, 151, 231, 179, 50, 112, 53, 73, 164, 529) ) variables <- c("AreaShape_MinorAxisLength", "AreaShape_MajorAxisLength", "AreaShape_Area") svd_entropy(sample, variables, cores = 1)
transform
transforms observation variables based on the specified
transformation method.
transform(population, variables, operation = "generalized_log", ...)
transform(population, variables, operation = "generalized_log", ...)
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
operation |
optional character string specifying method for transform.
This must be one of the strings |
... |
arguments passed to transformation operation. |
transformed data of the same class as population
.
population <- tibble::tibble( Metadata_Well = c("A01", "A02", "B01", "B02"), Intensity_DNA = c(8, 20, 12, 32), Intensity_RNA = c(1, 12, -1, 4), Intensity_AGP = c(-2, 5, -5, -2), Intensity_Mito = c(-1, 15, 5, 22), Intensity_ER = c(-12, 15, -25, 24) ) variables <- c("Intensity_DNA", "Intensity_RNA", "Intensity_AGP", "Intensity_ER") transform(population, variables, operation = "generalized_log") transform(population, variables, sample = population, operation = "husk", remove_outliers = FALSE) transform(population, variables, sample = population, operation = "spherize") transform(population, variables, n_components = 2, operation = "sparse_random_projection")
population <- tibble::tibble( Metadata_Well = c("A01", "A02", "B01", "B02"), Intensity_DNA = c(8, 20, 12, 32), Intensity_RNA = c(1, 12, -1, 4), Intensity_AGP = c(-2, 5, -5, -2), Intensity_Mito = c(-1, 15, 5, 22), Intensity_ER = c(-12, 15, -25, 24) ) variables <- c("Intensity_DNA", "Intensity_RNA", "Intensity_AGP", "Intensity_ER") transform(population, variables, operation = "generalized_log") transform(population, variables, sample = population, operation = "husk", remove_outliers = FALSE) transform(population, variables, sample = population, operation = "spherize") transform(population, variables, n_components = 2, operation = "sparse_random_projection")
variable_importance
measures importance of variables based on
specified methods.
variable_importance( sample, variables, operation = "replicate_correlation", ... )
variable_importance( sample, variables, operation = "replicate_correlation", ... )
sample |
tbl containing sample used to estimate parameters. |
variables |
character vector specifying observation variables. |
operation |
optional character string specifying method for computing
variable importance. This must be one of the strings
|
... |
arguments passed to variable importance operation. |
data frame containing variable importance measures.
set.seed(123) x1 <- rnorm(10) x2 <- x1 + rnorm(10) / 100 y1 <- rnorm(10) y2 <- y1 + rnorm(10) / 10 z1 <- rnorm(10) z2 <- z1 + rnorm(10) / 1 batch <- rep(rep(1:2, each = 5), 2) treatment <- rep(1:10, 2) replicate_id <- rep(1:2, each = 10) sample <- tibble::tibble( x = c(x1, x2), y = c(y1, y2), z = c(z1, z2), Metadata_treatment = treatment, Metadata_replicate_id = replicate_id, Metadata_batch = batch ) head(sample) # `replicate_correlation`` returns the median, min, and max # replicate correlation (across batches) per variable variable_importance( sample = sample, variables = c("x", "y", "z"), operation = "replicate_correlation", strata = c("Metadata_treatment"), replicates = 2, split_by = "Metadata_batch", replicate_by = "Metadata_replicate_id", cores = 1 ) # `svd_entropy`` measures the contribution of each variable in decreasing # the data entropy. variable_importance( sample = sample, variables = c("x", "y", "z"), operation = "svd_entropy", cores = 1 )
set.seed(123) x1 <- rnorm(10) x2 <- x1 + rnorm(10) / 100 y1 <- rnorm(10) y2 <- y1 + rnorm(10) / 10 z1 <- rnorm(10) z2 <- z1 + rnorm(10) / 1 batch <- rep(rep(1:2, each = 5), 2) treatment <- rep(1:10, 2) replicate_id <- rep(1:2, each = 10) sample <- tibble::tibble( x = c(x1, x2), y = c(y1, y2), z = c(z1, z2), Metadata_treatment = treatment, Metadata_replicate_id = replicate_id, Metadata_batch = batch ) head(sample) # `replicate_correlation`` returns the median, min, and max # replicate correlation (across batches) per variable variable_importance( sample = sample, variables = c("x", "y", "z"), operation = "replicate_correlation", strata = c("Metadata_treatment"), replicates = 2, split_by = "Metadata_batch", replicate_by = "Metadata_replicate_id", cores = 1 ) # `svd_entropy`` measures the contribution of each variable in decreasing # the data entropy. variable_importance( sample = sample, variables = c("x", "y", "z"), operation = "svd_entropy", cores = 1 )
variable_select
selects observation variables based on the specified
variable selection method.
variable_select( population, variables, sample = NULL, operation = "variance_threshold", ... )
variable_select( population, variables, sample = NULL, operation = "variance_threshold", ... )
population |
tbl with grouping (metadata) and observation variables. |
variables |
character vector specifying observation variables. |
sample |
tbl containing sample that is used by some variable selection
methods. |
operation |
optional character string specifying method for variable
selection. This must be one of the strings |
... |
arguments passed to selection operation. |
variable-selected data of the same class as population
.
# In this example, we use `correlation_threshold` as the operation for # variable selection. suppressMessages(suppressWarnings(library(magrittr))) population <- tibble::tibble( x = rnorm(100), y = rnorm(100) / 1000 ) population %<>% dplyr::mutate(z = x + rnorm(100) / 10) sample <- population %>% dplyr::slice(1:30) variables <- c("x", "y", "z") operation <- "correlation_threshold" cor(sample) # `x` and `z` are highly correlated; one of them will be removed head(population) futile.logger::flog.threshold(futile.logger::ERROR) variable_select(population, variables, sample, operation) %>% head()
# In this example, we use `correlation_threshold` as the operation for # variable selection. suppressMessages(suppressWarnings(library(magrittr))) population <- tibble::tibble( x = rnorm(100), y = rnorm(100) / 1000 ) population %<>% dplyr::mutate(z = x + rnorm(100) / 10) sample <- population %>% dplyr::slice(1:30) variables <- c("x", "y", "z") operation <- "correlation_threshold" cor(sample) # `x` and `z` are highly correlated; one of them will be removed head(population) futile.logger::flog.threshold(futile.logger::ERROR) variable_select(population, variables, sample, operation) %>% head()
variance_threshold
returns list of variables that have near-zero
variance.
variance_threshold(variables, sample)
variance_threshold(variables, sample)
variables |
character vector specifying observation variables. |
sample |
tbl containing sample used to estimate parameters. |
variance_threshold
is a reimplementation of caret::nearZeroVar
,
using the default values for freqCut
and uniqueCut
.
character vector specifying observation variables to be excluded.
sample <- tibble::tibble( AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7, 13, 18), AreaShape_Euler = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0) ) variables <- c("AreaShape_Area", "AreaShape_Euler") variance_threshold(variables, sample)
sample <- tibble::tibble( AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7, 13, 18), AreaShape_Euler = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0) ) variables <- c("AreaShape_Area", "AreaShape_Euler") variance_threshold(variables, sample)