![]() |
|
Go to the source code of this file.
Data Structures | |
struct | apop_name |
struct | apop_data |
struct | apop_settings_type |
struct | apop_model |
struct | apop_opts_type |
struct | apop_mle_settings |
struct | apop_lm_settings |
struct | apop_parts_wanted_settings |
struct | apop_cdf_settings |
struct | apop_pm_settings |
struct | apop_pmf_settings |
struct | apop_kernel_density_settings |
struct | apop_mcmc_proposal_s |
struct | apop_mcmc_settings |
struct | loess_struct |
struct | apop_loess_settings |
struct | apop_arms_settings |
struct | apop_stack_settings |
struct | apop_ct_settings |
struct | apop_dconstrain_settings |
struct | apop_composition_settings |
struct | apop_mixture_settings |
Macros | |
#define | _GNU_SOURCE |
#define | apop_varad_head(type, name) type variadic_##name(variadic_type_##name varad_in) |
#define | apop_varad_declare(type, name,...) |
#define | apop_varad_var(name, value) name = varad_in.name ? varad_in.name : (value); |
#define | apop_varad_link(name,...) variadic_##name((variadic_type_##name) {__VA_ARGS__}) |
#define | apop_data_add_names(dataset, type,...) apop_data_add_names_base((dataset), (type), (char const*[]) {__VA_ARGS__, NULL}) |
#define | apop_data_free(freeme) (apop_data_free_base(freeme) ? 0 : ((freeme)= NULL)) |
#define | apop_data_prune_columns(in,...) |
#define | apop_line_to_vector apop_array_to_vector |
#define | apop_vector_fill(avfin,...) apop_vector_fill_base((avfin), (double []) {__VA_ARGS__}) |
#define | apop_data_fill(adfin,...) apop_data_fill_base((adfin), (double []) {__VA_ARGS__}) |
#define | apop_text_fill(dataset,...) apop_text_fill_base((dataset), (char* []) {__VA_ARGS__, NULL}) |
#define | apop_data_falloc(sizes,...) apop_data_fill(apop_data_alloc sizes, __VA_ARGS__) |
#define | apop_gaussian |
#define | apop_OLS apop_ols |
#define | apop_PMF apop_pmf |
#define | apop_F_distribution apop_f_distribution |
#define | apop_WLS apop_wls |
#define | apop_IV apop_iv |
#define | apop_model_set_parameters(in,...) |
#define | apop_model_mixture(...) apop_model_mixture_base((apop_model *[]){__VA_ARGS__, NULL}) |
#define | apop_model_stack(...) apop_model_stack_base((apop_model *[]){__VA_ARGS__, NULL}) |
#define | apop_ANOVA apop_anova |
#define | apop_F_test apop_f_test |
#define | apop_estimate_r_squared(in) |
#define | apop_rng_get_thread(thread_in) |
#define | apop_update_hash(m1, m2) |
#define | apop_update_vtable_add(fn,...) apop_update_type_check(fn), apop_vtable_add("apop_update", fn, apop_update_hash(__VA_ARGS__)) |
#define | apop_update_vtable_get(...) apop_vtable_get("apop_update", apop_update_hash(__VA_ARGS__)) |
#define | apop_update_vtable_drop(...) apop_vtable_drop("apop_update", apop_update_hash(__VA_ARGS__)) |
#define | apop_score_hash(m1) ((size_t)((m1)->log_likelihood ? (m1)->log_likelihood : (m1)->p)) |
#define | apop_score_vtable_add(fn,...) apop_score_type_check(fn), apop_vtable_add("apop_score", fn, apop_score_hash(__VA_ARGS__)) |
#define | apop_score_vtable_get(...) apop_vtable_get("apop_score", apop_score_hash(__VA_ARGS__)) |
#define | apop_score_vtable_drop(...) apop_vtable_drop("apop_score", apop_score_hash(__VA_ARGS__)) |
#define | apop_parameter_model_hash(m1) ((size_t)((m1)->log_likelihood ? (m1)->log_likelihood : (m1)->p)*33 + (m1)->estimate ? (size_t)(m1)->estimate: 27) |
#define | apop_parameter_model_vtable_add(fn,...) apop_parameter_model_type_check(fn), apop_vtable_add("apop_parameter_model", fn, apop_parameter_model_hash(__VA_ARGS__)) |
#define | apop_parameter_model_vtable_get(...) apop_vtable_get("apop_parameter_model", apop_parameter_model_hash(__VA_ARGS__)) |
#define | apop_parameter_model_vtable_drop(...) apop_vtable_drop("apop_parameter_model", apop_parameter_model_hash(__VA_ARGS__)) |
#define | apop_predict_hash(m1) ((size_t)((m1)->log_likelihood ? (m1)->log_likelihood : (m1)->p)*33 + (m1)->estimate ? (size_t)(m1)->estimate: 27) |
#define | apop_predict_vtable_add(fn,...) apop_predict_type_check(fn), apop_vtable_add("apop_predict", fn, apop_predict_hash(__VA_ARGS__)) |
#define | apop_predict_vtable_get(...) apop_vtable_get("apop_predict", apop_predict_hash(__VA_ARGS__)) |
#define | apop_predict_vtable_drop(...) apop_vtable_drop("apop_predict", apop_predict_hash(__VA_ARGS__)) |
#define | apop_model_print_hash(m1) |
#define | apop_model_print_vtable_add(fn,...) apop_model_print_type_check(fn), apop_vtable_add("apop_model_print", fn, apop_model_print_hash(__VA_ARGS__)) |
#define | apop_model_print_vtable_get(...) apop_vtable_get("apop_model_print", apop_model_print_hash(__VA_ARGS__)) |
#define | apop_model_print_vtable_drop(...) apop_vtable_drop("apop_model_print", apop_model_print_hash(__VA_ARGS__)) |
#define | apop_test_ANOVA_independence(d) apop_test_anova_independence(d) |
#define | Apop_notify(verbosity,...) |
#define | Apop_maybe_abort(level) |
#define | Apop_stopif(test, onfail, level,...) |
#define | apop_errorlevel -5 |
#define | apop_return_data_error(E) {apop_data *out=apop_data_alloc(); out->error='E'; return out;} |
#define | Apop_assert_c(test, returnval, level,...) Apop_stopif(!(test), return returnval, level, __VA_ARGS__) |
#define | Apop_assert(test,...) Apop_assert_c((test), 0, apop_errorlevel, __VA_ARGS__) |
#define | Apop_assert_n(test,...) Apop_assert_c((test), , apop_errorlevel, __VA_ARGS__) |
#define | Apop_assert_nan(test,...) Apop_assert_c((test), GSL_NAN, apop_errorlevel, __VA_ARGS__) |
#define | Apop_assert_negone(test,...) Apop_assert_c((test), -1, apop_errorlevel, __VA_ARGS__) |
#define | apop_ml_imputation(d, m) apop_ml_impute(d, m) |
#define | APOP_SUBMATRIX(m, srow, scol, nrows, ncols, o) |
#define | Apop_row_v(m, row, v) Apop_matrix_row((m)->matrix, row, v) |
#define | Apop_col_v(m, col, v) |
#define | Apop_rows(d, rownum, len, outd) |
#define | Apop_row(d, row, outd) |
#define | Apop_cols(d, colnum, len, outd) |
#define | Apop_row_tv(m, row, v) |
#define | Apop_col_tv(m, col, v) |
#define | Apop_row_t(d, rowname, outd) |
#define | Apop_col_t(d, colname, outd) |
#define | Apop_subm(data_to_view, srow, scol, nrows, ncols) |
#define | Apop_rv(data_to_view, row) |
#define | Apop_cv(data_to_view, col) |
#define | apop_subvector(v, start, len) |
#define | apop_mrow(m, row) |
#define | Apop_rs(d, rownum, len) |
#define | Apop_cs(d, colnum, len) |
#define | Apop_r(d, rownum) |
#define | Apop_c(d, col) |
#define | APOP_COL Apop_col |
#define | apop_col Apop_col |
#define | APOP_COLS Apop_cols |
#define | apop_cols Apop_cols |
#define | APOP_COL_T Apop_col_t |
#define | apop_col_t Apop_col_t |
#define | APOP_COL_TV Apop_col_tv |
#define | apop_col_tv Apop_col_tv |
#define | APOP_COL_V Apop_col_v |
#define | apop_col_v Apop_col_v |
#define | APOP_ROW Apop_row |
#define | apop_row Apop_row |
#define | Apop_data_row Apop_row #deprecated |
#define | APOP_ROWS Apop_rows |
#define | apop_rows Apop_rows |
#define | APOP_ROW_T Apop_row_t |
#define | apop_row_t Apop_row_t |
#define | APOP_ROW_TV Apop_row_tv |
#define | apop_row_tv Apop_row_tv |
#define | APOP_ROW_V Apop_row_v |
#define | apop_row_v Apop_row_v |
#define | Apop_matrix_row(m, row, v) |
#define | Apop_matrix_col(m, col, v) |
#define | Apop_submatrix APOP_SUBMATRIX |
#define | APOP_MATRIX_ROW Apop_matrix_row |
#define | apop_matrix_row Apop_matrix_row |
#define | APOP_MATRIX_COL Apop_matrix_col |
#define | apop_matrix_col Apop_matrix_col |
#define | apop_sum(in) apop_vector_sum(in) |
#define | apop_var(in) apop_vector_var(in) |
#define | apop_mean(in) |
#define | Apop_settings_get_group(m, type) |
#define | Apop_settings_rm_group(m, type) |
#define | Apop_settings_add_group(model, type,...) |
#define | apop_model_copy_set(model, type,...) |
#define | Apop_settings_get(model, type, setting) |
#define | Apop_settings_set(model, type, setting, data) |
#define | Apop_settings_declarations(ysg) |
#define | Apop_settings_init(name,...) |
#define | Apop_varad_set(var, value) (out)->var = (in).var ? (in).var : (value); |
#define | Apop_settings_copy(name,...) |
#define | Apop_settings_free(name,...) |
#define | apop_model_coordinate_transform(...) Apop_model_copy_set(apop_coordinate_transform, apop_ct, __VA_ARGS__) |
#define | apop_model_dcompose(...) Apop_model_copy_set(apop_composition, apop_composition, __VA_ARGS__) |
#define | apop_model_dconstrain(...) Apop_model_copy_set(apop_dconstrain, apop_dconstrain, __VA_ARGS__) |
Typedefs | |
typedef struct apop_data | apop_data |
typedef struct apop_model | apop_model |
typedef apop_model *(* | apop_update_type )(apop_data *, apop_model *, apop_model *) |
typedef void(* | apop_score_type )(apop_data *d, gsl_vector *gradient, apop_model *params) |
typedef apop_model *(* | apop_parameter_model_type )(apop_data *, apop_model *) |
typedef apop_data *(* | apop_predict_type )(apop_data *d, apop_model *params) |
typedef void(* | apop_model_print_type )(apop_model *params, FILE *out) |
typedef struct apop_mcmc_proposal_s | apop_mcmc_proposal_s |
typedef struct apop_mcmc_settings | apop_mcmc_settings |
Functions | |
apop_name * | apop_name_alloc (void) |
int | apop_name_add (apop_name *n, char const *add_me, char type) |
void | apop_name_free (apop_name *free_me) |
void | apop_name_print (apop_name *n) |
void | apop_name_stack (apop_name *n1, apop_name *nadd, char type1, char typeadd) |
apop_name * | apop_name_copy (apop_name *in) |
int | apop_name_find (const apop_name *n, const char *findme, const char type) |
void | apop_data_add_names_base (apop_data *d, const char type, char const **names) |
char | apop_data_free_base (apop_data *freeme) |
apop_data * | apop_matrix_to_data (gsl_matrix *m) |
apop_data * | apop_vector_to_data (gsl_vector *v) |
apop_data * | apop_data_alloc (const size_t size1, const size_t size2, const int size3) |
apop_data * | apop_data_calloc (const size_t size1, const size_t size2, const int size3) |
apop_data * | apop_data_stack (apop_data *m1, apop_data *m2, char posn, char inplace) |
apop_data ** | apop_data_split (apop_data *in, int splitpoint, char r_or_c) |
apop_data * | apop_data_copy (const apop_data *in) |
void | apop_data_rm_columns (apop_data *d, int *drop) |
void | apop_data_memcpy (apop_data *out, const apop_data *in) |
double * | apop_data_ptr (apop_data *data, int row, int col, const char *rowname, const char *colname, const char *page) |
double | apop_data_get (const apop_data *data, size_t row, int col, const char *rowname, const char *colname, const char *page) |
int | apop_data_set (apop_data *data, size_t row, int col, const double val, const char *rowname, const char *colname, const char *page) |
void | apop_data_add_named_elmt (apop_data *d, char *name, double val) |
int | apop_text_add (apop_data *in, const size_t row, const size_t col, const char *fmt,...) |
apop_data * | apop_text_alloc (apop_data *in, const size_t row, const size_t col) |
void | apop_text_free (char ***freeme, int rows, int cols) |
apop_data * | apop_data_transpose (apop_data *in, char transpose_text, char inplace) |
gsl_matrix * | apop_matrix_realloc (gsl_matrix *m, size_t newheight, size_t newwidth) |
gsl_vector * | apop_vector_realloc (gsl_vector *v, size_t newheight) |
apop_data * | apop_data_prune_columns_base (apop_data *d, char **colnames) |
apop_data * | apop_data_get_page (const apop_data *data, const char *title, const char match) |
apop_data * | apop_data_add_page (apop_data *dataset, apop_data *newpage, const char *title) |
apop_data * | apop_data_rm_page (apop_data *data, const char *title, const char free_p) |
apop_data * | apop_data_rm_rows (apop_data *in, int *drop, int(*do_drop)(apop_data *, void *), void *drop_parameter) |
apop_data * | apop_model_draws (apop_model *model, int count, apop_data *draws) |
gsl_vector * | apop_vector_copy (const gsl_vector *in) |
gsl_matrix * | apop_vector_to_matrix (const gsl_vector *in, char row_col) |
gsl_matrix * | apop_matrix_copy (const gsl_matrix *in) |
apop_data * | apop_db_to_crosstab (char *tabname, char *r1, char *r2, char *datacol) |
gsl_vector * | apop_array_to_vector (double *in, int size) |
apop_data * | apop_text_to_data (char const *text_file, int has_row_names, int has_col_names, int const *field_ends, char const *delimiters) |
int | apop_text_to_db (char const *text_file, char *tabname, int has_row_names, int has_col_names, char **field_names, int const *field_ends, apop_data *field_params, char *table_params, char const *delimiters, char if_table_exists) |
apop_data * | apop_data_rank_expand (apop_data *in) |
apop_data * | apop_data_rank_compress (apop_data *in) |
void | apop_crosstab_to_db (apop_data *in, char *tabname, char *row_col_name, char *col_col_name, char *data_col_name) |
gsl_vector * | apop_data_pack (const apop_data *in, gsl_vector *out, char all_pages, char use_info_pages) |
void | apop_data_unpack (const gsl_vector *in, apop_data *d, char use_info_pages) |
apop_data * | apop_data_fill_base (apop_data *in, double[]) |
gsl_vector * | apop_vector_fill_base (gsl_vector *in, double[]) |
apop_data * | apop_text_fill_base (apop_data *data, char *text[]) |
int | apop_data_set_row (apop_data *row, apop_data *d, int row_number) |
void | apop_model_free (apop_model *free_me) |
void | apop_model_print (apop_model *print_me, FILE *out) |
void | apop_model_show (apop_model *print_me) |
apop_model * | apop_model_copy (apop_model *in) |
apop_model * | apop_model_clear (apop_data *data, apop_model *model) |
apop_model * | apop_estimate (apop_data *d, apop_model *m) |
void | apop_score (apop_data *d, gsl_vector *out, apop_model *m) |
double | apop_log_likelihood (apop_data *d, apop_model *m) |
double | apop_p (apop_data *d, apop_model *m) |
double | apop_cdf (apop_data *d, apop_model *m) |
int | apop_draw (double *out, gsl_rng *r, apop_model *m) |
void | apop_prep (apop_data *d, apop_model *m) |
apop_model * | apop_parameter_model (apop_data *d, apop_model *m) |
apop_data * | apop_predict (apop_data *d, apop_model *m) |
apop_model * | apop_beta_from_mean_var (double m, double v) |
apop_model * | apop_model_set_parameters_base (apop_model *in, double ap[]) |
apop_model * | apop_model_mixture_base (apop_model **inlist) |
apop_model * | apop_model_stack_base (apop_model *mlist[]) |
apop_data * | apop_map (apop_data *in, double(*fn_d)(double), double(*fn_v)(gsl_vector *), double(*fn_r)(apop_data *), double(*fn_dp)(double, void *), double(*fn_vp)(gsl_vector *, void *), double(*fn_rp)(apop_data *, void *), double(*fn_dpi)(double, void *, int), double(*fn_vpi)(gsl_vector *, void *, int), double(*fn_rpi)(apop_data *, void *, int), double(*fn_di)(double, int), double(*fn_vi)(gsl_vector *, int), double(*fn_ri)(apop_data *, int), void *param, int inplace, char part, int all_pages) |
double | apop_map_sum (apop_data *in, double(*fn_d)(double), double(*fn_v)(gsl_vector *), double(*fn_r)(apop_data *), double(*fn_dp)(double, void *), double(*fn_vp)(gsl_vector *, void *), double(*fn_rp)(apop_data *, void *), double(*fn_dpi)(double, void *, int), double(*fn_vpi)(gsl_vector *, void *, int), double(*fn_rpi)(apop_data *, void *, int), double(*fn_di)(double, int), double(*fn_vi)(gsl_vector *, int), double(*fn_ri)(apop_data *, int), void *param, char part, int all_pages) |
gsl_vector * | apop_matrix_map (const gsl_matrix *m, double(*fn)(gsl_vector *)) |
gsl_vector * | apop_vector_map (const gsl_vector *v, double(*fn)(double)) |
void | apop_matrix_apply (gsl_matrix *m, void(*fn)(gsl_vector *)) |
void | apop_vector_apply (gsl_vector *v, void(*fn)(double *)) |
gsl_matrix * | apop_matrix_map_all (const gsl_matrix *in, double(*fn)(double)) |
void | apop_matrix_apply_all (gsl_matrix *in, void(*fn)(double *)) |
double | apop_vector_map_sum (const gsl_vector *in, double(*fn)(double)) |
double | apop_matrix_map_sum (const gsl_matrix *in, double(*fn)(gsl_vector *)) |
double | apop_matrix_map_all_sum (const gsl_matrix *in, double(*fn)(double)) |
void | apop_plot_histogram (gsl_vector *data, size_t bin_count, char *with, char const *output_name, FILE *output_pipe, char output_type, char output_append) |
void | apop_matrix_print (const gsl_matrix *data, char const *output_name, FILE *output_pipe, char output_type, char output_append) |
void | apop_vector_print (gsl_vector *data, char const *output_name, FILE *output_pipe, char output_type, char output_append) |
void | apop_data_print (const apop_data *data, char const *output_name, FILE *output_pipe, char output_type, char output_append) |
void | apop_matrix_show (const gsl_matrix *data) |
void | apop_vector_show (const gsl_vector *data) |
void | apop_data_show (const apop_data *data) |
double | apop_vector_mean (gsl_vector const *v, gsl_vector const *weights) |
double | apop_vector_var (gsl_vector const *v, gsl_vector const *weights) |
double | apop_vector_skew_pop (gsl_vector const *v, gsl_vector const *weights) |
double | apop_vector_kurtosis_pop (gsl_vector const *v, gsl_vector const *weights) |
double | apop_vector_cov (gsl_vector const *v1, gsl_vector const *v2, gsl_vector const *weights) |
double | apop_vector_distance (const gsl_vector *ina, const gsl_vector *inb, const char metric, const double norm) |
void | apop_vector_normalize (gsl_vector *in, gsl_vector **out, const char normalization_type) |
void | apop_matrix_normalize (gsl_matrix *data, const char row_or_col, const char normalization) |
apop_data * | apop_data_covariance (const apop_data *in) |
apop_data * | apop_data_correlation (const apop_data *in) |
long double | apop_matrix_sum (const gsl_matrix *m) |
double | apop_matrix_mean (const gsl_matrix *data) |
void | apop_matrix_mean_and_var (const gsl_matrix *data, double *mean, double *var) |
apop_data * | apop_data_summarize (apop_data *data) |
apop_data * | apop_test_fisher_exact (apop_data *intab) |
int | apop_matrix_is_positive_semidefinite (gsl_matrix *m, char semi) |
double | apop_matrix_to_positive_semidefinite (gsl_matrix *m) |
long double | apop_multivariate_gamma (double a, int p) |
long double | apop_multivariate_lngamma (double a, int p) |
apop_data * | apop_t_test (gsl_vector *a, gsl_vector *b) |
apop_data * | apop_paired_t_test (gsl_vector *a, gsl_vector *b) |
apop_data * | apop_anova (char *table, char *data, char *grouping1, char *grouping2) |
apop_data * | apop_f_test (apop_model *est, apop_data *contrast) |
apop_data * | apop_text_unique_elements (const apop_data *d, size_t col) |
gsl_vector * | apop_vector_unique_elements (const gsl_vector *v) |
apop_data * | apop_data_to_factors (apop_data *data, char intype, int incol, int outcol) |
apop_data * | apop_data_get_factor_names (apop_data *data, int col, char type) |
apop_data * | apop_data_to_dummies (apop_data *d, int col, char type, int keep_first, char append, char remove) |
double | apop_kl_divergence (apop_model *from, apop_model *to, int draw_ct, gsl_rng *rng) |
apop_data * | apop_estimate_coefficient_of_determination (apop_model *) |
void | apop_estimate_parameter_tests (apop_model *est) |
apop_data * | apop_jackknife_cov (apop_data *data, apop_model *model) |
apop_data * | apop_bootstrap_cov (apop_data *data, apop_model *model, gsl_rng *rng, int iterations, char keep_boots, char ignore_nans) |
gsl_rng * | apop_rng_alloc (int seed) |
double | apop_rng_GHgB3 (gsl_rng *r, double *a) |
gsl_rng * | apop_rng_get_thread_base (int thread) |
int | apop_arms_draw (double *out, gsl_rng *r, apop_model *m) |
Adaptive rejection metropolis sampling. More... | |
gsl_vector * | apop_numerical_gradient (apop_data *data, apop_model *model, double delta) |
apop_data * | apop_model_hessian (apop_data *data, apop_model *model, double delta) |
apop_data * | apop_model_numerical_covariance (apop_data *data, apop_model *model, double delta) |
void | apop_maximum_likelihood (apop_data *data, apop_model *dist) |
apop_model * | apop_estimate_restart (apop_model *e, apop_model *copy, char *starting_pt, double boundary) |
long double | apop_linear_constraint (gsl_vector *beta, apop_data *constraint, double margin) |
apop_model * | apop_model_fix_params (apop_model *model_in) |
apop_model * | apop_model_fix_params_get_base (apop_model *model_in) |
int | apop_vtable_add (char const *tabname, void *fn_in, unsigned long hash) |
void * | apop_vtable_get (char const *tabname, unsigned long hash) |
int | apop_vtable_drop (char const *tabname, unsigned long hash) |
void | apop_update_type_check (apop_update_type in) |
void | apop_score_type_check (apop_score_type in) |
void | apop_parameter_model_type_check (apop_parameter_model_type in) |
void | apop_predict_type_check (apop_predict_type in) |
void | apop_model_print_type_check (apop_model_print_type in) |
double | apop_generalized_harmonic (int N, double s) __attribute__((__pure__)) |
apop_data * | apop_test_anova_independence (apop_data *d) |
int | apop_regex (const char *string, const char *regex, apop_data **substrings, const char use_case) |
int | apop_system (const char *fmt,...) __attribute__((format(printf |
int gsl_vector * | apop_vector_moving_average (gsl_vector *, size_t) |
apop_data * | apop_histograms_test_goodness_of_fit (apop_model *h0, apop_model *h1) |
apop_data * | apop_test_kolmogorov (apop_model *m1, apop_model *m2) |
apop_data * | apop_data_pmf_compress (apop_data *in) |
apop_data * | apop_data_to_bins (apop_data *indata, apop_data *binspec, int bin_count, char close_top_bin) |
apop_model * | apop_model_to_pmf (apop_model *model, apop_data *binspec, long int draws, int bin_count, gsl_rng *rng) |
char * | apop_text_paste (apop_data const *strings, char *between, char *before, char *after, char *between_cols, int(*prune)(apop_data *, int, int, void *), void *prune_parameter) |
apop_data * | apop_data_listwise_delete (apop_data *d, char inplace) |
apop_model * | apop_ml_impute (apop_data *d, apop_model *meanvar) |
apop_model * | apop_model_metropolis (apop_data *d, gsl_rng *rng, apop_model *m) |
apop_model * | apop_update (apop_data *data, apop_model *prior, apop_model *likelihood, gsl_rng *rng) |
double | apop_test (double statistic, char *distribution, double p1, double p2, char tail) |
double * | apop_vector_percentiles (gsl_vector *data, char rounding) |
apop_data * | apop_data_sort (apop_data *data, apop_data *sort_order, char asc, char inplace, double *col_order) |
apop_data * | apop_rake (char const *margin_table, char *const *var_list, int var_ct, char const *all_vars, char *const *contrasts, int contrast_ct, char const *structural_zeros, int max_iterations, double tolerance, char const *count_col, int run_number, char const *init_table, char const *init_count_col, double nudge, char const *table_name) |
double | apop_det_and_inv (const gsl_matrix *in, gsl_matrix **out, int calc_det, int calc_inv) |
apop_data * | apop_dot (const apop_data *d1, const apop_data *d2, char form1, char form2) |
int | apop_vector_bounded (const gsl_vector *in, long double max) |
gsl_matrix * | apop_matrix_inverse (const gsl_matrix *in) |
double | apop_matrix_determinant (const gsl_matrix *in) |
apop_data * | apop_matrix_pca (gsl_matrix *data, int const dimensions_we_want) |
gsl_vector * | apop_vector_stack (gsl_vector *v1, gsl_vector *v2, char inplace) |
gsl_matrix * | apop_matrix_stack (gsl_matrix *m1, gsl_matrix *m2, char posn, char inplace) |
gsl_matrix * | apop_matrix_rm_columns (gsl_matrix *in, int *drop) |
void | apop_vector_log (gsl_vector *v) |
void | apop_vector_log10 (gsl_vector *v) |
void | apop_vector_exp (gsl_vector *v) |
long double | apop_vector_sum (const gsl_vector *in) |
double | apop_vector_var_m (const gsl_vector *in, const double mean) |
double | apop_vector_correlation (const gsl_vector *ina, const gsl_vector *inb) |
double | apop_vector_kurtosis (const gsl_vector *in) |
double | apop_vector_skew (const gsl_vector *in) |
int | apop_table_exists (char const *name, char remove) |
int | apop_db_open (char const *filename) |
int | apop_db_close (char vacuum) |
int | apop_query (const char *q,...) __attribute__((format(printf |
int gsl_matrix * | apop_query_to_matrix (const char *fmt,...) __attribute__((format(printf |
int gsl_matrix apop_data * | apop_query_to_text (const char *fmt,...) __attribute__((format(printf |
int gsl_matrix apop_data apop_data * | apop_query_to_data (const char *fmt,...) __attribute__((format(printf |
int gsl_matrix apop_data apop_data apop_data * | apop_query_to_mixed_data (const char *typelist, const char *fmt,...) __attribute__((format(printf |
int gsl_matrix apop_data apop_data apop_data gsl_vector * | apop_query_to_vector (const char *fmt,...) __attribute__((format(printf |
int gsl_matrix apop_data apop_data apop_data gsl_vector double | apop_query_to_float (const char *fmt,...) __attribute__((format(printf |
int gsl_matrix apop_data apop_data apop_data gsl_vector double int | apop_data_to_db (const apop_data *set, const char *tabname, char) |
void * | apop_settings_get_grp (apop_model *m, char *type, char fail) |
void | apop_settings_remove_group (apop_model *m, char *delme) |
void | apop_settings_copy_group (apop_model *outm, apop_model *inm, char *copyme) |
void * | apop_settings_group_alloc (apop_model *model, char *type, void *free_fn, void *copy_fn, void *the_group) |
apop_model * | apop_settings_group_alloc_wm (apop_model *model, char *type, void *free_fn, void *copy_fn, void *the_group) |
Variables | |
apop_opts_type | apop_opts |
apop_model * | apop_beta |
apop_model * | apop_bernoulli |
apop_model * | apop_binomial |
apop_model * | apop_chi_squared |
apop_model * | apop_dirichlet |
apop_model * | apop_exponential |
apop_model * | apop_f_distribution |
apop_model * | apop_gamma |
apop_model * | apop_improper_uniform |
apop_model * | apop_iv |
apop_model * | apop_kernel_density |
apop_model * | apop_loess |
apop_model * | apop_logit |
apop_model * | apop_lognormal |
apop_model * | apop_multinomial |
apop_model * | apop_multivariate_normal |
apop_model * | apop_normal |
apop_model * | apop_ols |
apop_model * | apop_pmf |
apop_model * | apop_poisson |
apop_model * | apop_probit |
apop_model * | apop_t_distribution |
apop_model * | apop_uniform |
apop_model * | apop_wls |
apop_model * | apop_yule |
apop_model * | apop_zipf |
apop_model * | apop_coordinate_transform |
apop_model * | apop_composition |
apop_model * | apop_dconstrain |
apop_model * | apop_mixture |
apop_model * | apop_stack |
#define Apop_c | ( | d, | |
col | |||
) |
A macro to generate a temporary one-column view of apop_data set d
, pulling out only column col
. After this call, outd
will be a pointer to this temporary view, that you can use as you would any apop_data set.
#define Apop_col_t | ( | d, | |
colname, | |||
outd | |||
) |
After this call, v
will hold a view of an apop_data set consisting only of vector view of the col
th column of the apop_data set m
. Unlike Apop_c, the second argument is a column name, that I'll look up using apop_name_find.
#define Apop_col_tv | ( | m, | |
col, | |||
v | |||
) |
After this call, v
will hold a vector view of the col
th column of the apop_data set m
. Unlike Apop_cv, the second argument is a column name, that I'll look up using apop_name_find.
#define Apop_col_v | ( | m, | |
col, | |||
v | |||
) |
Deprecated. Use Apop_cv.
#define Apop_cols | ( | d, | |
colnum, | |||
len, | |||
outd | |||
) |
Deprecated. Use Apop_cs.
#define Apop_cs | ( | d, | |
colnum, | |||
len | |||
) |
A macro to generate a temporary view of apop_data set d
, beginning at column col
and having length len
. It expires as soon as the program leaves the current scope (like with the usual automatically declared vars).
#define Apop_cv | ( | data_to_view, | |
col | |||
) |
A macro to generate a temporary one-column view of the matrix in an apop_data set d
, pulling out only column col
. The view is a gsl_vector
set.
As usual, column -1 is the vector element of the apop_data set.
The view is automatically allocated, and disappears as soon as the program leaves the scope in which it is declared.
#define apop_data_add_names | ( | dataset, | |
type, | |||
... | |||
) | apop_data_add_names_base((dataset), (type), (char const*[]) {__VA_ARGS__, NULL}) |
Add a list of names to a data set.
NULL
, and call the base function: NULL
marker, this has good odds of segfaulting. You may prefer to use a for
loop that inserts each name in turn using apop_name_add.#define apop_data_falloc | ( | sizes, | |
... | |||
) | apop_data_fill(apop_data_alloc sizes, __VA_ARGS__) |
Allocate a data set and fill it with values. Put the data set dimensions (one, two, or three dimensions as per apop_data_alloc) in parens, then the data (as per apop_data_fill). E.g.:
If you forget the parens, you will get an obscure error during compilation.
#define apop_data_fill | ( | adfin, | |
... | |||
) | apop_data_fill_base((adfin), (double []) {__VA_ARGS__}) |
Fill a pre-allocated data set with values.
For example:
Warning: I need as many arguments as the size of the data set, and can't count them for you. Too many will be ignored; too few will produce unpredictable results, which may include padding your matrix with garbage or a simple segfault.
Underlying this function is a base function that takes a single list, as opposed to a set of unassociated numbers as above:
adfin | An apop_data set (that you have already allocated). |
... | A series of at least as many floating-point values as there are blanks in the data set. |
vector->size==matrix->size1
; otherwise I just use matrix->size1
.#define apop_data_free | ( | freeme | ) | (apop_data_free_base(freeme) ? 0 : ((freeme)= NULL)) |
Free an apop_data structure.
As with free()
, it is safe to send in a NULL
pointer (in which case the function does nothing).
If the more
pointer is not NULL
, I will free the pointed-to data set first. If you don't want to free data sets down the chain, set more=NULL
before calling this.
freeme
to NULL
when it's done, because there's nothing safe you can do with the freed location, and you can later safely test conditions like if (data) ...
. #define apop_data_prune_columns | ( | in, | |
... | |||
) |
Keep only the columns of a data set that you name.
in | The data set to prune. |
... | A list of names to retain (i.e. the columns that shouldn't be pruned out). For example, if you have run apop_data_summarize, you have columns for several statistics, but may care about only one or two; see the example. |
For example:
NULL
string at the end, and calls that function. #define apop_gaussian |
Alias for the apop_normal distribution, qv.
#define Apop_matrix_col | ( | m, | |
col, | |||
v | |||
) |
View a single column of a gsl_matrix
as a gsl_vector
. This is a convenience macro wrapping gsl_matrix_column
.
m | The gsl_matrix |
col | The number of the desired column. |
v | The name of the vector view that will be created. |
An: example
#define Apop_matrix_row | ( | m, | |
row, | |||
v | |||
) |
View a single row of a gsl_matrix
as a gsl_vector
. This is a convenience macro wrapping gsl_matrix_row
.
m | The gsl_matrix |
row | The number of the desired row. |
v | The name of the vector view that will be created. |
An: example
#define Apop_maybe_abort | ( | level | ) |
#define apop_model_coordinate_transform | ( | ... | ) | Apop_model_copy_set(apop_coordinate_transform, apop_ct, __VA_ARGS__) |
Build an apop_coordinate_transform model, qv.
#define apop_model_dcompose | ( | ... | ) | Apop_model_copy_set(apop_composition, apop_composition, __VA_ARGS__) |
Data composition is using either random draws or parameter estimates from the output of one model as the input data for another model.
apop_composition
model. #define apop_model_dconstrain | ( | ... | ) | Apop_model_copy_set(apop_dconstrain, apop_dconstrain, __VA_ARGS__) |
Build an apop_dconstrain
model, q.v., which applies a data constraint to the data set. For example, this is how one would truncate a model to have data above zero.
#define apop_model_mixture | ( | ... | ) | apop_model_mixture_base((apop_model *[]){__VA_ARGS__, NULL}) |
Produce a model as a linear combination of other models. See the documentation for the apop_mixture model.
... | A list of models, either all parameterized or all unparameterized. See examples in the apop_mixture documentation. |
#define apop_model_print_hash | ( | m1 | ) |
#define apop_model_stack | ( | ... | ) | apop_model_stack_base((apop_model *[]){__VA_ARGS__, NULL}) |
Generate a model consisting of several models bound together. The output apop_model is a copy of apop_stack; see that model's documentation for details.
Sample use:
apop_opts.verbose >= 1
. error=='n' | First model input is NULL . |
#define apop_mrow | ( | m, | |
row | |||
) |
#define Apop_notify | ( | verbosity, | |
... | |||
) |
Notify the user of errors, warning, or debug info.
verbosity | At what verbosity level should the user be warned? E.g., if level==2, then print iff apop_opts.verbosity >= 2. |
... | The message to write to STDERR (presuming the verbosity level is high enough). This can be a printf-style format with following arguments. You can produce much more informative error messages this way, e.g., apop_notify (0, "Beta is %g but should be greater than zero.", beta);. |
#define Apop_r | ( | d, | |
rownum | |||
) |
A macro to generate a temporary one-row view of apop_data set d
, pulling out only row row
. The view is also an apop_data set, with names and other decorations.
The view is automatically allocated, and disappears as soon as the program leaves the scope in which it is declared.
#define apop_rng_get_thread | ( | thread_in | ) |
The gsl_rng
is not itself thread-safe, in the sense that it can not be used simultaneously by multiple threads. However, if each thread has its own gsl_rng
, then each will safely operate independently.
Thus, Apophenia keeps an internal store of RNGs for use by threaded functions. If the input to this function, thread
, is greater than any previous input, then the array of gsl_rng
s is extended to length thread
, and each element extended using ++apop_opts.rng_seed
(i.e., the seed is incremented before use).
thread_in | The number of the RNG to retrieve, starting at zero (which is how OpenMP numbers its threads). If blank, I'll look up the current thread (via omp_get_thread_num ) for you. |
#define Apop_row | ( | d, | |
row, | |||
outd | |||
) |
Deprecated. Use Apop_r.
#define Apop_row_t | ( | d, | |
rowname, | |||
outd | |||
) |
After this call, v
will hold a view of an apop_data set consisting only of the row
th row of the apop_data set m
. Unlike Apop_r, the second argument is a row name, that I'll look up using apop_name_find.
#define Apop_row_tv | ( | m, | |
row, | |||
v | |||
) |
After this call, v
will hold a vector view of the row
th row of the apop_data set m
. Unlike Apop_rv, the second argument is a row name, that I'll look up using apop_name_find.
#define Apop_rows | ( | d, | |
rownum, | |||
len, | |||
outd | |||
) |
Deprecated. Use Apop_rs.
#define Apop_rs | ( | d, | |
rownum, | |||
len | |||
) |
A macro to generate a temporary view of apop_data set d
, beginning at row row
and having length len
. The view expires as soon as the program leaves the current scope (like with the usual automatically declared vars).
#define Apop_rv | ( | data_to_view, | |
row | |||
) |
A macro to generate a temporary one-row view of the matrix in an apop_data set d
, pulling out only row row
. The view is a gsl_vector
set.
The view is automatically allocated, and disappears as soon as the program leaves the scope in which it is declared.
#define Apop_settings_copy | ( | name, | |
... | |||
) |
A convenience macro for declaring the copy function for a new settings group. See the documentation outline -> models -> model settings -> writing new settings group for details.
To just do a direct copy, the default works; let your settings group be named ysg:
generates a function that allocates space for a new settings group and copies all elements from the input group to the output group.
The space after the comma indicates that there is no new procedural code. If you want to add some, feel free. E.g.,
The names in
and out
are built into the macro.
#define Apop_settings_declarations | ( | ysg | ) |
#define Apop_settings_free | ( | name, | |
... | |||
) |
A convenience macro for declaring the delete function for a new settings group. See the documentation outline -> models -> model settings -> writing new settings group for details.
If you don't have internal structure elements to free, let your settings group be named ysg:
generates a function that simply frees the input settings group.
If your structure is pointing to other structures that need to be freed first, then add them after that comma:
The name in
is built into the macro.
#define Apop_settings_init | ( | name, | |
... | |||
) |
A convenience macro for declaring the initialization function for a new settings group. See the documentation outline -> models -> model settings -> writing new settings group for details.
This sets the defaults for every element in the structure, so you will want a line for every element of your structure (except the ones that default to NULL, which have already been set as such).
If you need them, the input is a structure named in
, and the output a pointer-to-struct named out
.
#define Apop_stopif | ( | test, | |
onfail, | |||
level, | |||
... | |||
) |
Execute an action and print a message to stderr
(or the current FILE
handle held by apop_opts.log_file
). Intended for leaving a function on failure.
test | The expression that, if true, triggers all the action. |
onfail | If the assertion fails, do this. E.g., out->error='x'; return GSL_NAN . Notice that it is OK to include several lines of semicolon-separated code here, but if you have a lot to do, the most readable option may be goto outro , plus an appropriately-labeled section at the end of your function. |
level | Print the warning message only if apop_opts.verbose is greater than or equal to this. Zero usually works, but for minor infractions use one. |
... | The error message in printf form, plus any arguments to be inserted into the printf string. I'll provide the function name and a carriage return. |
apop_opts.stop_on_warning
is nonzero and not 'v'
, then a failed test halts via abort()
, even if the apop_opts.verbose
level is set so that the warning message doesn't print to screen. Use this when running via debugger. apop_opts.stop_on_warning
is 'v'
, then a failed test halts via abort()
iff the verbosity level is high enough to print the error. #define Apop_subm | ( | data_to_view, | |
srow, | |||
scol, | |||
nrows, | |||
ncols | |||
) |
Find the mean of the input vector. Generate a subview of a submatrix within a gsl_matrix
. Like Apop_r, et al., the view is an automatically-allocated variable that is lost once the program flow leaves the scope in which it is declared.
data_to_view | The root matrix |
srow | the first row (in the root matrix) of the top of the submatrix |
scol | the first column (in the root matrix) of the left edge of the submatrix |
nrows | number of rows in the submatrix |
ncols | number of columns in the submatrix |
#define APOP_SUBMATRIX | ( | m, | |
srow, | |||
scol, | |||
nrows, | |||
ncols, | |||
o | |||
) |
Deprecated. Use Apop_subm.
#define apop_subvector | ( | v, | |
start, | |||
len | |||
) |
#define apop_text_fill | ( | dataset, | |
... | |||
) | apop_text_fill_base((dataset), (char* []) {__VA_ARGS__, NULL}) |
Fill the text part of an already-allocated apop_data set with a list of strings.
dataset | A data set that you already prepared with apop_text_alloc. |
... | A list of strings. The first row is filled first, then the second, and so on to the end of the text grid. |
NULL
strings. A blank string, ""
is OK. apop_opts.verbose >=1
, I print a warning and continue to the end of the text grid or data set, whichever is shorter. NULL
, I return NULL
. If you provide a NULL
data set but a non-NULL list of text elements, and apop_opts.verbose >=1
, I print a warning and return NULL
. "three" "two"
to form "threetwo"
, leaving you with only five strings.NULL-delimited
array of strings (not just a loose list as above), then use apop_text_fill_base
. #define apop_update_hash | ( | m1, | |
m2 | |||
) |
#define apop_varad_declare | ( | type, | |
name, | |||
... | |||
) |
#define apop_vector_fill | ( | avfin, | |
... | |||
) | apop_vector_fill_base((avfin), (double []) {__VA_ARGS__}) |
Fill a pre-allocated gsl_vector
with values.
See apop_data_alloc
for a relevant example. See also apop_matrix_alloc
.
Warning: I need as many arguments as the size of the vector, and can't count them for you. Too many will be ignored; too few will produce unpredictable results, which may include padding your vector with garbage or a simple segfault.
avfin | A gsl_vector (that you have already allocated). |
... | A series of exactly as many values as there are spaces in the vector. |
The apop_data structure represents a data set. It primarily joins together a gsl_vector, a gsl_matrix, and a table of strings, then gives them all row and column names. It tries to be minimally intrusive, so you can use it everywhere you would use a gsl_matrix
or a gsl_vector
.
If you are viewing the HTML documentation, here is a diagram showing a sample data set with all of the elements in place. Together, they represet a data set where each row is an observation, which includes both numeric and text values, and where each row/column is named.
Rowname | Vector | Matrix | Text | Weights | ||||||||||||||||||||||||||||||||||
|
|
|
|
|
Allocate using apop_data_alloc
, free via apop_data_free
, or more generally, see the apop_data_
... section of the index (in the header links) for the many other functions that operate on this struct.
See also the Data Sets section of the outline page (also in the header links) for further notes on getting and manipulating the elements of an apop_data set.
typedef struct apop_mcmc_proposal_s apop_mcmc_proposal_s |
A proposal distribution for apop_mcmc_settings and its accompanying functions and information. By default, these will be apop_multivariate_normal models. The step_fn
and adapt_fn
have to be written around the model and your preferences. For the defaults, the step function recenters the mean of the distribution around the last accepted proposal, and the adapt function widens the Σ for the Normal if the accept rate is too low; narrows it if the accept rate is too large.
You may provide an array of proposals. The length of the list of proposals must match the number of chunks, as per the gibbs_chunks
setting in the apop_mcmc_settings group that the array of proposals is a part of. Each proposal must be initialized to include all elements, and the step and adapt functions probably have to be written anew for each type of model.
This segment of the interface is in beta. A future revision may make it easier to design new proposals.
typedef struct apop_model apop_model |
A statistical model.
apop_data* apop_anova | ( | char * | table, |
char * | data, | ||
char * | grouping1, | ||
char * | grouping2 | ||
) |
This function produces a traditional one- or two-way ANOVA table. It works from data in an SQL table, using queries of the form select data from table group by grouping1, grouping2
.
table | The table to be queried. Anything that can go in an SQL from clause is OK, so this can be a plain table name or a temp table specification like (select ... ) , with parens. |
data | The name of the column holding the count or other such data |
grouping1 | The name of the first column by which to group data |
grouping2 | If this is NULL , then the function will return a one-way ANOVA. Otherwise, the name of the second column by which to group data in a two-way ANOVA. |
int apop_arms_draw | ( | double * | out, |
gsl_rng * | r, | ||
apop_model * | m | ||
) |
Adaptive rejection metropolis sampling.
This is a function to make random draws from any univariate distribution (more or less).
The author, Wally Gilks, explains on http://www.amsta.leeds.ac.uk/~wally.gilks/adaptive.rejection/web_page/Welcome.html, that ``ARS works by constructing an envelope function of the log of the target density, which is then used in rejection sampling (see, for example, Ripley, 1987). Whenever a point is rejected by ARS, the envelope is updated to correspond more closely to the true log density, thereby reducing the chance of rejecting subsequent points. Fewer ARS rejection steps implies fewer point-evaluations of the log density.''
apop_arms_settings
structure. The structure also holds a history of the points tested to date. That means that the system will be more accurate as more draws are made. It also means that if the parameters change, or you use apop_model_copy, you should call Apop_settings_rm_group(your_model, apop_arms)
to clear the model of points that are not valid for a different situation.apop_model_add_group(your_model, apop_arms, .model=your_model, .xl=8, .xr =14);
. The model
element is mandatory; you'll get a run-time complaint if you forget it. apop_model* apop_beta_from_mean_var | ( | double | m, |
double | v | ||
) |
The Beta distribution is useful for modeling because it is bounded between zero and one, and can be either unimodal (if the variance is low) or bimodal (if the variance is high), and can have either a slant toward the bottom or top of the range (depending on the mean).
The distribution has two parameters, typically named and
, which can be difficult to interpret. However, there is a one-to-one mapping between (alpha, beta) pairs and (mean, variance) pairs. Since we have good intuition about the meaning of means and variances, this function takes in a mean and variance, calculates alpha and beta behind the scenes, and returns a random draw from the appropriate Beta distribution.
m | The mean the Beta distribution should have. Notice that m is in [0,1]. |
v | The variance which the Beta distribution should have. It is in (0, 1/12), where (1/12) is the variance of a Uniform(0,1) distribution. Funny things happen with variance near 1/12 and mean far from 1/2. |
apop_beta
model with its parameters appropriately set. out->error=='r' | Range error: mean is not within [0, 1]. |
apop_data* apop_bootstrap_cov | ( | apop_data * | data, |
apop_model * | model, | ||
gsl_rng * | rng, | ||
int | iterations, | ||
char | keep_boots, | ||
char | ignore_nans | ||
) |
Give me a data set and a model, and I'll give you the bootstrapped covariance matrix of the parameter estimates.
data | The data set. An apop_data set where each row is a single data point. (No default) |
model | An apop_model, whose estimate method will be used here. (No default) |
iterations | How many bootstrap draws should I make? (default: 1,000) |
rng | An RNG that you have initialized, probably with apop_rng_alloc . (Default: an RNG from apop_rng_get_thread) |
keep_boots | If 'y', then add a page to the output apop_data set with the statistics calculated for each bootstrap iteration. They are packed via apop_data_pack, so use apop_data_unpack if needed. (Default: 'n') |
ignore_nans | If 'y' and any of the elements in the estimation return NaN , then I will throw out that draw and try again. If 'n' , then I will write that set of statistics to the list, NaN and all. I keep count of throw-aways; if there are more than iterations elements thrown out, then I throw an error and return with estimates using data I have so far. That is, I assume that NaNs are rare edge cases; if they are as common as good data, you might want to rethink how you are using the bootstrap mechanism. (Default: 'n') |
apop_data
set whose matrix element is the estimated covariance matrix of the parameters. out->error=='n' | NULL input data. |
out->error=='N' | too many Nans.
|
void apop_crosstab_to_db | ( | apop_data * | in, |
char * | tabname, | ||
char * | row_col_name, | ||
char * | col_col_name, | ||
char * | data_col_name | ||
) |
See apop_db_to_crosstab for the storyline; this is the complement, which takes a crosstab and writes its values to the database.
For example, I would take
c0 | c1 | |
r0 | 2 | 3 |
r1 | 0 | 4 |
and do the following writes to the database:
r0
, r1
, ... c0
, c1
, .... Text columns get their own numbering system, t0
, t1
, ..., which is a little more robust than continuing the column count from the matrix.void apop_data_add_named_elmt | ( | apop_data * | d, |
char * | name, | ||
double | val | ||
) |
A convenience function to add a named element to a data set. Many of Apophenia's testing procedures use this to easily produce a column of named parameters. It is public as a convenience.
d | The apop_data structure. Must not be NULL , but may be blank (as per allocation via apop_data_alloc ( ) ). |
name | The name to add |
val | the value to add to the set. |
NULL
), I will call apop_vector_realloc internally to make space. Add a page to a apop_data set. It gets a name so you can find it later.
dataset | The input data set, to which a page will be added. |
newpage | The page to append |
title | The name of the new page. Remember, this is truncated at 100 characters. |
NULL
data set and apop_opts.verbose >=1
.<Covariance>
, will be considered informational and ignored by search routines, missing data routines, et cetera. This is achieved by a rule in apop_data_pack and apop_data_unpack.Here is a toy example that establishes a baseline data set, adds a page, modifies it, and then later retrieves it.
apop_data* apop_data_alloc | ( | const size_t | size1, |
const size_t | size2, | ||
const int | size3 | ||
) |
Allocate a apop_data structure, to be filled with data.
apop_data_alloc(2,3,4)
: vector size, matrix rows, matrix cols. If the first argument is zero, you get a NULL
vector. apop_data_alloc(2,3)
, would allocate just a matrix, leaving the vector NULL
. apop_data_alloc(2)
, would allocate just a vector, leaving the matrix NULL
. apop_data_alloc()
, will produce a basically blank set, with out->matrix==out->vector==NULL
.For allocating the text part, see apop_text_alloc.
The weights
vector is set to NULL
. If you need it, allocate it via
out->error=='a' | Allocation error. The matrix, vector, or names couldn't be malloc ed, which probably means that you requested a very large data set. |
NULL
. But if even this much fails, your computer may be on fire and you should go put it out.apop_data* apop_data_calloc | ( | const size_t | size1, |
const size_t | size2, | ||
const int | size3 | ||
) |
Allocate a apop_data structure, to be filled with data; set everything in the allocated portion to zero. See apop_data_alloc for details.
out->error=='m' | malloc error; probably out of memory. |
Copy one apop_data structure to another. That is, all data is duplicated.
Basically a front-end for apop_data_memcpy for those who prefer this sort of syntax.
Unlike apop_data_memcpy, I do follow the more
pointer.
in | the input data |
out.error='a' | Allocation error. |
out.error='c' | Cyclic link: D->more == D (may be later in the chain, e.g., D->more->more = D->more ) You'll have only a partial copy. |
out.error='d' | Dimension error; should never happen. |
out.error='p' | Missing part error; should never happen.
|
Returns the matrix of correlation coefficients relating each column with each other.
in | A data matrix: rows are observations, columns are variables. If you give me a weights vector, I'll use it. |
out->error='a' | Allocation error. |
Returns the sample variance/covariance matrix relating each column of the matrix to each other column.
in | An apop_data set. If the weights vector is set, I'll take it into account. |
out->error='a' | Allocation error. |
char apop_data_free_base | ( | apop_data * | freeme | ) |
Free the elements of the given apop_data set and then the apop_data set itself. Intended to be used by apop_data_free, a macro that calls this to free elements, then sets the value to NULL
.
NULL
. For typical cases, that's slightly more useful than this function.freeme.error='c' | Circular linking is against the rules. If freeme->more == freeme , then I set freeme.error='c' and return. If you send in a structure like A -> B -> B, then both data sets A and B will be marked. |
0
on OK, 'c'
on error. Factor names are stored in an auxiliary table with a name like "<categories for your_var>"
. Producing this name is annoying (and prevents us from eventually making it human-language independent), so use this function to get the list of factor names.
data | The data set. (No default, must not be NULL ) |
col | The column in the main data set whose name I'll use to check for the factor name list. Vector==-1. (default=0) |
type | If you are referring to a text column, use 't'. (default='d') |
It's good form to get a page from your data set by name, because you may not know the order for the pages, and the stepping through makes for dull code anyway (apop_data *page = dataset; while (page->more) page= page->more;
).
data | The apop_data set to use. No default; if NULL , gives a warning if apop_opts.verbose >=1 and returns NULL . |
title | The name of the page to retrieve. Default="Info" , which is the name of the page of additional estimation information returned by estimation routines (log likelihood, status, AIC, BIC, confidence intervals, ...). |
match | If 'c' , case-insensitive match (via strcasecmp ); if 'e' , exact match, if 'r' regular expression substring search (via apop_regex). Default='c' . |
NULL
.If there is an NaN anywhere in the row of data (including the matrix, the vector, the weights, and the text) then delete the row from the data set.
NULL
. apop_opts.nan_string
is not NULL
, then I will make case-insensitive comparisons to the text elements to check for bad data as well. inplace
= 'y', then I'll free each element of the input data set and refill it with the pruned elements. I'll still take up (up to) twice the size of the data set in memory during the function. If every row has an NaN, then your apop_data
set will end up with NULL
vector, matrix, .... if inplace
= 'n', then the original data set is left unmolested. more
element is ignored). d | The data, with NaNs |
inplace | If 'y' , clear out the pointer-to-apop_data that you sent in and refill with the pruned data. If 'n' , leave the set alone and return a new data set. Default='n' . |
inplace=='y'
, a pointer to the input, which was shortened in place. If the entire data set is cleared out, then this will be NULL
. Copy one apop_data structure to another.
This function does not allocate the output structure or the vector, matrix, text, or weights elements—I assume you have already done this and got the dimensions right. I will assert that there is at least enough room in the destination for your data, and fail if the copy would write more elements than there are bins.
in
and out
have a more
pointer, also copy subsequent page(s). out | A structure that this function will fill. Must be preallocated with the appropriate sizes. |
in | The input data. |
out.error='d' | Dimension error; couldn't copy. |
out.error='p' | Part missing; e.g., in->matrix exists but out->matrix doesn't; couldn't copy. |
Say that you have added a long list of observations to a single apop_data set, meaning that each row has weight one. There are a huge number of duplicates, perhaps because there are a handful of types that keep repeating:
Vector value | Text name | Weights |
12 | Dozen | 1 |
1 | Single | 1 |
2 | Pair | 1 |
2 | Pair | 1 |
1 | Single | 1 |
1 | Single | 1 |
2 | Pair | 1 |
2 | Pair | 1 |
You would like to reduce this to a set of distinct values, with their weights adjusted accordingly:
Vector value | Text name | Weights |
12 | Dozen | 1 |
1 | Single | 3 |
2 | Pair | 4 |
in | An apop_data set that may have duplicate rows. As above, the data may be in text and/or numeric formats. If there is a weights vector, I will add those weights together as duplicates are merged. If there is no weights vector, I will create one, which is initially set to one for all values, and then aggregated as above. |
Keep only the columns of a data set that you name. This is the function called internally by the apop_data_prune_columns macro. In most cases, you'll want to use that macro. An example of the two uses demonstrating the difference:
d | The data set to prune. |
colnames | A null-terminated list of names to retain (i.e. the columns that shouldn't be pruned out). |
One often finds data where the column indicates the value of the data point. There may be two columns, and a mark in the first indicates a miss while a mark in the second is a hit. Or say that we have the following list of observations:
Then we could write this as:
because there are six 1s observed, four 2s observed, and two 3s observed. We call this rank format, because 1 (or zero) is typically the most common, 2 is second most common, et cetera.
This function takes in a list of observations, and aggregates them into a single row in rank format.
0 0 1 1 0
, then I won't know to generate results with three bins where the last bin has probability zero.The complement to this is apop_data_rank_compress; see that function's documentation for the story and an example.
This function takes in a data set where the zeroth column includes the count(s) of times that zero was observed, the first gives the count(s) of times that one was observed, et cetera. It outputs a data set whose vector element includes a list that has exactly the given frequency of zeros, ones, et cetera.
void apop_data_rm_columns | ( | apop_data * | d, |
int * | drop | ||
) |
Remove the columns set to one in the drop
vector. The returned data structure looks like it was modified in place, but the data matrix and the names are duplicated before being pared down, so if your data is taking up more than half of your memory, this may not work.
d | the apop_data structure to be pared down. |
drop | an array of ints. If use[7]==1, then column seven will be cut from the output. A reminder: calloc(in->size2 , sizeof(int)) will fill your array with zeros on allocation, and memset(use, 1, in->size2 * sizeof(int)) will quickly fill an array of ints with nonzero values. |
Remove the first page from an apop_data set that matches a given name.
data | The input data set, to which a page will be added. No default. If NULL , I return silently if apop_opts.verbose < 1 ; print an error otherwise. |
title | The case-insensitive name of the page to remove. Default: "Info" |
free_p | If 'y' , then apop_data_free the page. Default: 'y' . |
apop_data
page that I just pulled out. Thus, you can use this to pull a single page from a data set. I set that page's more
pointer to NULL
, to minimize any confusion about more-than-linear linked list topologies. If free_p=='y'
(the default) or the page is not found, return NULL
.->more
pointer in the apop_data set is not to fully implement a linked list, but primarily to allow you to staple auxiliary information to a main data set.apop_opts.verbose >= 1
. apop_data* apop_data_rm_rows | ( | apop_data * | in, |
int * | drop, | ||
apop_fn_ir | do_drop, | ||
void * | drop_parameter | ||
) |
Remove the rows set to one in the drop
vector or for which the do_drop
function returns one.
in | the apop_data structure to be pared down |
drop | a vector with as many elements as the max of the vector, matrix, or text parts of in , with a one marking those columns to be removed. |
do_drop | A function that returns one for rows to drop and zero for rows to not drop. A sample function: (onerow, .row=0, .col=0) , which can help to keep things readable). |
drop_parameter | If your do_drop function requires additional input, put it here and it will be passed through. |
NULL
vector
, matrix
, weight
, and text. Therefore, you may wish to check for NULL
elements after use. I remove rownames, but leave the other names, in case you want to add new data rows.NULL
, I return without doing anything, and print a warning if apop_opts.verbose >=1
. If you provide both, I will drop the row if either the vector has a one in that row's position, or if the function returns a nonzero value. Now that you've used Apop_r to pull a row from an apop_data set, this function lets you write that row to another position in the same data set or a different data set entirely.
The set written to must have the same form as the original:
d->names->rowct == row_number
(all rows up to row_number
have row names), then extend the list of row names by one to add the new name. Else, don't add the row name. apop_data_copy
or apop_name_copy
if you need to copy these names.If any of the source elements are NULL
, I won't bother to check that element in the destination.
_Thread_local
keyword) or a version of GCC with the __thread
extension enabled. apop_data* apop_data_sort | ( | apop_data * | data, |
apop_data * | sort_order, | ||
char | asc, | ||
char | inplace, | ||
double * | col_order | ||
) |
Sort an apop_data set on an arbitrary sequence of columns.
The sort_order
set is a one-row data set that should look like the data set being sorted. The easiest way to generate it is to use Apop_r to pull one row of the table, then copy and fill it. For each column you want used in the sort, assign a ranking giving whether the column should be sorted first, second, .... Columns you don't want used in the sorting should be set to NAN
. Ties are broken by the earlier element in the default order (see below).
E.g., to sort by the last column of a five-column matrix first, then the next-to-last column, then the next-to-next-to-last, then by the first text column, then by the second text column:
I use only comparisons, not the actual numeric values, so you can use any sequence of numbers: (1, 2, 3) and (-1.32, 0, 27) work identically.
strcasecmp
. [exercise for the reader: modify the source to use Glib's locale-correct string sorting.]data | The data set to be sorted. If NULL , this function is a no-op that returns NULL . |
sort_order | A apop_data set describing the order in which columns are used for sorting, as above. If NULL , then sort by the vector, then each matrix column, then text, then weights, then row names. |
inplace | If 'n', make a copy, else sort in place. (default: 'y'). |
asc | If 'a', ascending; if 'd', descending. This is applied to all columns; column-by-column application is to do. (default: 'a'). |
col_order | For internal use only. In your call, it should be NULL ; the Designated initializers syntax will takes care of it for you. |
inplace=='y'
(the default), then this is the same as the input set.A few examples:
Split one input apop_data structure into two.
For the opposite operation, see apop_data_stack.
in | The apop_data structure to split |
splitpoint | The index of what will be the first row/column of the second data set. E.g., if this is -1 and r_or_c=='c' , then the whole data set will be in the second data set; if this is the length of the matrix then the whole data set will be in the first data set. Another way to put it is that splitpoint will equal the number of rows/columns in the first matrix (unless it is -1, in which case the first matrix will have zero rows, or it is greater than the matrix's size, in which case it will have as many rows as the original). |
r_or_c | If this is 'r' or 'R', then put some rows in the first data set and some in the second; of 'c' or 'C', split columns into first and second data sets. |
NULL
pointer will be returned in that position. For example, for a data set of 50 rows, apop_data **out = apop_data_split(data, 100, 'r')
sets out[0] = apop_data_copy(data)
and out[1] = NULL
.more
pointer is ignored. apop_data->vector
is taken to be the -1st element of the matrix. apop_data_free(in)
after this. Put the first data set either on top of or to the left of the second data set.
The fn returns a new data set, meaning that at the end of this function, until you apop_data_free() the original data sets, you will be taking up twice as much memory. Plan accordingly.
For the opposite operation, see apop_data_split.
m1 | the upper/rightmost data set (default = NULL ) |
m2 | the second data set (default = NULL ) |
posn | If 'r', stack rows of m1's matrix above rows of m2's if 'c', stack columns of m1's matrix to left of m2's (default = 'r') |
inplace | If 'y' , use apop_matrix_realloc and apop_vector_realloc to modify m1 in place; see the caveats on those function. Otherwise, allocate a new vector, leaving m1 unmolested. (default='n') |
m1
out->error=='a' | Allocation error. |
out->error=='d' | Dimension error; couldn't make a complete copy. |
m2
is NULL
and inplace
is 'y'
, where you'll get the original m1
pointer back) more
is ignored. m1
, with the names for m2
appended to the row or column list, as appropriate. apop_data* apop_data_to_bins | ( | apop_data * | indata, |
apop_data * | binspec, | ||
int | bin_count, | ||
char | close_top_bin | ||
) |
Create a histogram from data by putting data into bins of fixed width.
indata | The input data that will be binned. This is copied and the copy will be modified. |
close_top_bin | Normally, a bin covers the range from the point equal to its minimum to points strictly less than the minimum plus the width. if 'y' , then the top bin includes points less than or equal to the upper bound. This solves the problem of displaying histograms where the top bin is just one point. |
binspec | This is an apop_data set with the same number of columns as indata . If you want a fixed size for the bins, then the first row of the bin spec is the bin width for each column. This allows you to specify a width for each dimension, or specify the same size for all with something like: |
bin_count | If you don't provide a bin spec, I'll provide this many evenly-sized bins. Default: ![]() ![]() |
<binspec>
, so you can snap a second data set to the same grid using The text segment, if any, is not binned. I use apop_data_pmf_compress as the final step in the binning, and that does respect the text segment.
Here is a sample program highlighting the difference between apop_data_to_bins and apop_data_pmf_compress .
apop_data* apop_data_to_dummies | ( | apop_data * | d, |
int | col, | ||
char | type, | ||
int | keep_first, | ||
char | append, | ||
char | remove | ||
) |
A utility to make a matrix of dummy variables. You give me a single vector that lists the category number for each item, and I'll produce a matrix with a single one in each row in the column specified.
After that, you have to decide what to do with the new matrix and the original data column.
.remove='y'
option specifies that I should use apop_data_rm_columns to remove the column used to generate the dummies. Implemented only for type=='d'
..append='y'
or .append='e'
I will run the above two lines for you. Your apop_data pointer will not change, but its matrix
element will be reallocated (via apop_data_stack)..append='i'
, I will place the matrix of dummies in place, immediately after the data column you had specified. You will probably use this with .remove='y'
to replace the single column with the new set of dummy columns. Bear in mind that if there are two or more dummy columns (which there probably are if you are bothering to use this function), subsequent column numbers will change..append='i'
and you asked for a text column, I will append to the end of the table, which is equivalent to append='e'
.d | The data set with the column to be dummified (No default.) |
col | The column number to be transformed; -1==vector (default = 0) |
type | 'd'==data column, 't'==text column. (default = 't') |
keep_first | if zero, return a matrix where each row has a one in the (column specified MINUS ONE). That is, the zeroth category is dropped, the first category has an entry in column zero, et cetera. If you don't know why this is useful, then this is what you need. If you know what you're doing and need something special, set this to one and the first category won't be dropped. (default = 0) |
append | If 'e' or 'y' , append the dummy grid to the end of the original data matrix. If 'i' , insert in place, immediately after the original data column. (default = 'n' ) |
remove | If 'y' , remove the original data or text column. (default = 'n' ) |
matrix
element is the one-zero matrix of dummies. If you used .append
, then this is the main matrix. Also, I add a page named "\<categories for your_var\>"
giving a reference table of names and column numbers (where your_var
is the appropriate column heading). out->error=='a' | allocation error |
out->error=='d' | dimension error
|
Convert a column of text or numbers into a column of numeric factors, which you can use for a multinomial probit/logit, for example.
If you don't run this on your data first, apop_probit and apop_logit default to running it on the vector or (if no vector) zeroth column of the matrix of the input apop_data set, because those models need a list of the unique values of the dependent variable.
data | The data set to be modified in place. (No default. If NULL , returns NULL and a warning) |
intype | If 't' , then incol refers to text, otherwise ('d' is a good choice) refers to the vector or matrix. Default = 't' . |
incol | The column in the text that will be converted. -1 is the vector. Default = 0. |
outcol | The column in the data set where the numeric factors will be written (-1 means the vector). Default = 0. |
For example:
Notice that the query pulled a column of ones for the sake of saving room for the factors. It reads column zero of the text, and writes it to column zero of the matrix.
Another example:
Here, the type
column is converted to sequential integer factors and those factors overwrite the original data. Since a reference table is added as a second page of the apop_data set, you can recover the original values as needed.
apop_data
set with only one column of text. Also, I add a page named "<categories for your_var>"
giving a reference table of names and column numbers (where your_var
is the appropriate column heading) use apop_data_get_factor_names to retrieve that table.out->error=='a' | allocation error. |
out->error=='d' | dimension error.
|
Transpose the matrix and text elements of the input data set, including the row/column names.
The vector and weights elements of the input data set are completely ignored (but see also apop_vector_to_matrix, which can convert a vector to a 1 X N matrix.) If copying, these other elements won't be present; if .inplace='y'
, it is up to you to handle these not-transposed elements correctly.
in | The input apop_data set. If NULL , I return NULL . Default is NULL . |
transpose_text | If 'y' , then also transpose the text element. Default is 'y' . |
inplace | If 'y' , transpose the input in place; if 'n' , produce a transposed copy, leaving the original untouched. Due to how gsl_matrix_transpose_memcpy works, a copy will still be made, then copied to the original location. Default is 'y' . |
inplace=='n'
, a newly alloced apop_data set, with the appropriately transposed matrix and/or text. The vector and weights elements will be NULL
. If transpose_text='n'
, then the text element of the output set will also be NULL
.inplace=='y'
, a pointer to the original data set, with matrix and (if transpose_text='y'
) text transposed and vector and weights left in place untouched.gsl_matrix
with no names or text, you may prefer to use gsl_matrix_transpose_memcpy
.int apop_db_close | ( | char | vacuum | ) |
Closes the database on disk. If you opened the database with apop_db_open(NULL)
, then this is basically optional.
vacuum | 'v': vacuum—do clean-up to minimize the size of the database on disk. 'q': Don't bother; just close the database. (default = 'q') |
int apop_db_open | ( | char const * | filename | ) |
If you want to use a database on the hard drive instead of memory, then call this once and only once before using any other database utilities.
If you want a disposable database which you won't use after the program ends, don't bother with this function.
The trade-offs between an on-disk database and an in-memory db are as one would expect: memory is faster, but is destroyed when the program exits. SQLite includes a command line utility (sqlite3
) which let you ask queries of a database on disk, which may be useful for debugging. There are also some graphical front-ends; just ask your favorite search engine for SQLite GUI.
MySQL users: either set the environment variable APOP_DB_ENGINE=mysql or set apop_opts.db_engine
= 'm'.
The Apophenia package assumes you are only using a single SQLite database at a time. You can use the SQL attach
function to load other databases, or see this blog post for further suggestions and sample code.
When you are done doing your database manipulations, be sure to call apop_db_close if writing to disk.
filename | The name of a file on the hard drive on which to store the database. If NULL , then the database will be kept in memory (in which case, the other database functions will call this function for you and you don't need to bother). |
apop_data* apop_db_to_crosstab | ( | char * | tabname, |
char * | r1, | ||
char * | r2, | ||
char * | datacol | ||
) |
Give the name of a table in the database, and names of three of its columns: the x-dimension, the y-dimension, and the data. the output is a 2D matrix with rows indexed by r1 and cols by r2.
tabname | The database table I'm querying. Anything that will work inside a from clause is OK, such as a subquery in parens. |
r1 | The column of the data set that will indicate the rows of the output crosstab |
r2 | The column of the data set that will indicate the columns of the output crosstab |
datacol | The column of the data set holding the data for the cells of the crosstab |
NULL
data set and if apop_opts.verbosity >= 1
print a warning.select
query to generate the data. One is to specify an ad hoc table to pull from:The other is to use the fact that the table name will be at the end of the query, so you can add conditions to the table:
out->error='n' | Name not found error. |
out->error='q' | Query returned an empty table (which might mean that it just failed). |
void apop_estimate_parameter_tests | ( | apop_model * | est | ) |
For many, it is a knee-jerk reaction to a parameter estimation to test whether each individual parameter differs from zero. This function does that.
est | The apop_model, which includes pre-calculated parameter estimates, var-covar matrix, and the original data set. |
Returns nothing. At the end of the routine, est->info->more
includes a set of t-test values: p value, confidence (=1-pval), t statistic, standard deviation, one-tailed Pval, one-tailed confidence.
apop_data* apop_f_test | ( | apop_model * | est, |
apop_data * | contrast | ||
) |
Runs an F-test specified by q
and c
. Your best bet is to see the chapter on hypothesis testing in Modeling With Data, p 309. It will tell you that:
and that's what this function is based on.
est | an apop_model that you have already calculated. (No default) |
contrast | The matrix ![]() ![]() NULL , it is set to the identity matrix with the top row missing. If the vector is NULL , it is set to a zero matrix of length equal to the height of the contrast matrix. Thus, if the entire apop_data set is NULL or omitted, we are testing the hypothesis that all but ![]() |
apop_data
set with a few variants on the confidence with which we can reject the joint hypothesis. NULL
contrast set, I will generate the set of linear contrasts that are equivalent to the ANOVA-type approach. Readers of {Modeling with Data}, note that there's a bug in the book that claims that the traditional ANOVA approach also checks that the coefficient for the constant term is also zero; this is not the custom and doesn't produce the equivalence presented in that and other textbooks.out->error='a' | Allocation error. |
out->error='d' | dimension-matching error. |
out->error='i' | matrix inversion error. |
out->error='m' | GSL math error.
|
double apop_generalized_harmonic | ( | int | N, |
double | s | ||
) |
Calculate
N
is zero or negative, return NaN. Notify the user if apop_opts.verbosity >=1
For example:
apop_data* apop_jackknife_cov | ( | apop_data * | in, |
apop_model * | model | ||
) |
Give me a data set and a model, and I'll give you the jackknifed covariance matrix of the model parameters.
The basic algorithm for the jackknife (with many details glossed over): create a sequence of data sets, each with exactly one observation removed, and then produce a new set of parameter estimates using that slightly shortened data set. Then, find the covariance matrix of the derived parameters.
Jackknife or bootstrap? As a broad rule of thumb, the jackknife works best on models that are closer to linear. The worse a linear approximation does (at the given data), the worse the jackknife approximates the variance.
Sample usage:
in | The data set. An apop_data set where each row is a single data point. |
model | An apop_model, that will be used internally by apop_estimate. |
out->error=='n' | NULL input data. |
apop_data
set whose matrix element is the estimated covariance matrix of the parameters. double apop_kl_divergence | ( | apop_model * | from, |
apop_model * | to, | ||
int | draw_ct, | ||
gsl_rng * | rng | ||
) |
Kullback-Leibler divergence.
This measure of the divergence of one distribution from another has the form . Notice that it is not a distance, because there is an asymmetry between
and
, so one can expect that
.
from | the ![]() NULL ) |
to | the ![]() NULL ) |
draw_ct | If I do the calculation via random draws, how many? (Default = 1e5) |
rng | A gsl_rng . If NULL or number of threads is greater than 1, I'll take care of the RNG; see apop_rng_get_thread. (Default = NULL ) |
This function can take empirical histogram-type models (apop_pmf) or continuous models like apop_loess or apop_normal.
If there is a PMF (I'll try from
first, under the presumption that you are measuring the divergence of data from an observed data distribution), then I'll step through it for the points in the summation.
GSL_NEGINF
. If apop_opts.verbose >=1
I print a message as well.If neither distribution is a PMF, then I'll take draw_ct
random draws from to
and evaluate at those points.
apop_opts.verbose = 3
for observation-by-observation info.long double apop_linear_constraint | ( | gsl_vector * | beta, |
apop_data * | constraint, | ||
double | margin | ||
) |
This is designed to be called from within the constraint method of your apop_model. Just write the constraint vector+matrix and this will do the rest. See the outline page for detailed discussion on setting contrasts.
beta | The proposed vector about to be tested. No default, must not be NULL . |
constraint | A vector/matrix pair [v | m1 m2 ... mn] where each row is interpreted as a less-than inequality: ![]() ![]() ![]() ![]() |
margin | If zero, then this is a >= constraint, otherwise I will return a point this amount within the borders. You could try GSL_DBL_EPSILON , which is the smallest value a double can hold, or something like 1e-3. Default = 0. |
return The penalty = the distance between beta and the closest point that meets the constraints. If the constraint is not met, this beta
is shifted by margin
(Euclidean distance) to meet the constraints.
int apop_matrix_is_positive_semidefinite | ( | gsl_matrix * | m, |
char | semi | ||
) |
Test whether the input matrix is positive semidefinite.
A covariance matrix will always be PSD, so this function can tell you whether your matrix is a valid covariance matrix.
Consider the 1x1 matrix in the upper left of the input, then the 2x2 matrix in the upper left, on up to the full matrix. If the matrix is PSD, then each of these has a positive determinant. This function thus calculates determinants for an
x
matrix.
m | The matrix to test. If NULL , I will return zero—not PSD. |
semi | If anything but 's', check for positive definite, not semidefinite. (default 's') |
See also apop_matrix_to_positive_semidefinite, which will change the input to something PSD.
void apop_matrix_normalize | ( | gsl_matrix * | data, |
const char | row_or_col, | ||
const char | normalization | ||
) |
Normalize each row or column in the given matrix, one by one.
Basically just a convenience fn to iterate through the columns or rows and run apop_vector_normalize for you.
data | The data set to normalize. |
row_or_col | Either 'r' or 'c'. |
normalization | see apop_vector_normalize. |
gsl_matrix* apop_matrix_realloc | ( | gsl_matrix * | m, |
size_t | newheight, | ||
size_t | newwidth | ||
) |
This function will resize a gsl_matrix
to a new height or width.
Data in the matrix will be retained. If the new height or width is smaller than the old, then data in the later rows/columns will be cropped away (in a non–memory-leaking manner). If the new height or width is larger than the old, then new cells will be filled with garbage; it is your responsibility to zero out or otherwise fill new rows/columns before use.
Warning I: Using this function is basically bad form—especially when used in a for
loop that adds a column each time. A large number of realloc
s can take a noticeable amount of time. You are thus encouraged to make an effort to determine the size of your data beforehand.
Warning II: The gsl_matrix
is a versatile struct that can represent submatrices and other cuts from parent data. I can't deal with those, and check for such situations beforehand. [Besides, resizing a portion of a parent matrix makes no sense.]
m | The already-allocated matrix to resize. If you give me NULL , this becomes equivalent to gsl_matrix_alloc |
newheight,newwidth | The height and width you'd like the matrix to be. |
gsl_matrix* apop_matrix_rm_columns | ( | gsl_matrix * | in, |
int * | drop | ||
) |
Delete columns from a matrix.
This is done via copying, so if you have an exceptionally large data set, you're better off producing the matrix in the perfect form directly.
in | the gsl_matrix to be subsetted |
gsl_matrix
with the specified columns removed. If you ask me to remove no columns, I'll return a copy of the original. If you ask me to remove all columns, I'll return NULL
. drop | an array of int s. If use[7]==1, then column seven will be cut from the output. |
apop_data* apop_matrix_to_data | ( | gsl_matrix * | m | ) |
Deprecated; please do not use. Just use a compound literal:
double apop_matrix_to_positive_semidefinite | ( | gsl_matrix * | m | ) |
First, this function passes tests, but is under development.
It takes in a matrix and converts it to the `closest' positive semidefinite matrix.
m | On input, any matrix; on output, a positive semidefinite matrix. |
Adapted from the R Matrix package's nearPD, which is Copyright (2007) Jens Oehlschlägel [and is GPL].
apop_model* apop_ml_impute | ( | apop_data * | d, |
apop_model * | mvn | ||
) |
Impute the most likely data points to replace NaNs in the data, and insert them into the given data. That is, the data set is modified in place.
How it works: this uses the machinery for apop_model_fix_params. The only difference is that this searches over the data space and takes the parameter space as fixed, while basic fix params model searches parameters and takes data as fixed. So this function just does the necessary data-parameter switching to make that happen.
d | The data set. It comes in with NaNs and leaves entirely filled in. |
mvn | A parametrized apop_model from which you expect the data was derived. if NULL , then I'll use the Multivariate Normal that best fits the data after listwise deletion. |
apop_ml_impute_model
. Also, the data input will be filled in and ready to use. apop_data* apop_model_draws | ( | apop_model * | model, |
int | count, | ||
apop_data * | draws | ||
) |
Make a set of random draws from a model and write them to an apop_data set.
model | The model from which draws will be made. Must already be prepared and/or estimated. |
count | The number of draws to make. If draw_matrix is not NULL , then this is ignored and count=draw_matrix->matrix->size1 . default=1000. |
draws | If not NULL , a pre-allocated data set whose matrix element will be filled with draws. |
size
draws. If draw_matrix!=NULL
, then return a pointer to it.out->error=='m' | Input model isn't good for making draws: it is NULL , or m->dsize=0 . |
out->error=='s' | You gave me a draws matrix, but its size is less than the size of a single draw from the data, model->dsize . |
out->error=='d' | Trouble drawing from the distribution for at least one row. That row is set to all NAN . |
NULL apop_data
set, but its matrix
element is NULL
, when apop_opts.verbose>=1
.Here is a two-line program to draw a different set of ten Standard Normals on every run (provided runs are more than a second apart):
apop_model* apop_model_fix_params | ( | apop_model * | model_in | ) |
Produce a model based on another model, but with some of the parameters fixed at a given value.
You will send me the model whose parameters you want fixed, with the parameters
element set as follows. For the fixed parameters, simply give the values to which they will be fixed. Set the free parameters to NaN
.
For example, here is a Binomial distribution with a fixed but
allowed to float freely:
The output is an apop_model
that can be estimated, Bayesian updated, et cetera.
estimate
method always uses an MLE, and it never calls the base model's estimate
method.estimate
method. Otherwise, I'll set my own.more
pointer of the parameters
for additional pages and NaN
s on those pages.Here is a sample program. It produces a few thousand draws from a Multivariate Normal distribution, and then tries to recover the means given a var/covar matrix fixed at the correct variance.
model_in | The base model |
apop_model* apop_model_fix_params_get_base | ( | apop_model * | fixed_model | ) |
The apop_model_fix_params function produces a model that has only the non-fixed parameters of the model. After estimation of the fixed-parameter model, this function fills the parameters
element of the base model and returns a pointer to the base model.
apop_data* apop_model_hessian | ( | apop_data * | data, |
apop_model * | model, | ||
double | delta | ||
) |
Numerically estimate the matrix of second derivatives of the parameter values. The math is simply a series of re-evaluations at small differential steps. [Therefore, it may be expensive to do this for a very computationally-intensive model.]
data | The data at which the model was estimated |
model | The model, with parameters already estimated |
delta | the step size for the differentials. The current default is around 1e-3. |
apop_model* apop_model_metropolis | ( | apop_data * | d, |
gsl_rng * | rng, | ||
apop_model * | m | ||
) |
Use Metropolis-Hastings Markov chain Monte Carlo to make draws from the given model.
The basic storyline is that draws are made from a proposal distribution, and the likelihood of your model given your data and the drawn parameters evaluated. At each step, a new set of proposal parameters are drawn, and if either they are more likely than the previous set the new proposal is accepted as the next step, else with probability (prob of new params)/(prob of old params), they are accepted as the next step anyway. Otherwise the last accepted proposal is repeated.
The output is an apop_pmf model with a data set listing the draws that were accepted, including those repetitions. The output model is modified so that subsequent draws are one more step from the Markov chain, via apop_model_metropolis_draw.
constraint
element of the model you input, then the proposal is thrown out and a new one selected. By the default proposal distribution, this is not mathematically correct (it breaks detailed balance), and values near the constraint will be oversampled. The output model will have outmodel->error=='c'
. It is up to you to decide whether the resulting distribution is good enough for your purposes or whether to take the time to write a custom proposal and step function to accommodate the constraint.Attach an apop_mcmc_settings group to your model to specify the proposal distribution, burnin, and other details of the search. See the apop_mcmc_settings documentation for details.
base_adapt_fn
in the apop_mcmc_settings group to a do-nothing function, or one that damps its adaptation as gibbs_chunks
element of the apop_mcmc_settings group. If you set gibbs_chunks='a'
, all parameters are drawn as a set, and accepted/rejected as a set. The variances are adapted at an identical rate. If you set gibbs_chunks='i'
, then each scalar parameter is assigned its own proposal distribution, which is adapted at its own pace. With gibbs_chunks='b'
(the default), then each of the vector, matrix, and weights of your model's parameters are drawn/accepted/adapted as a group (and so on to additional chunks if your model has ->more
pages). This works well for complex models which naturally break down into subsets of parameters.Each chunk counts as a step in the Markov chain. Therefore, if there are several chunks, you can expect chunks to repeat from step to step. If you want a draw after cycling through all chunks, try using apop_model_metropolis_draw, which has that behavior.
d | The apop_data set used for evaluating the likelihood of a proposed parameter set. |
rng | A gsl_rng , probably allocated via apop_rng_alloc. (Default: an RNG from apop_rng_get_thread) |
m | The apop_model from which parameters are being drawn. (No default; must not be NULL ) |
parameters
element of your likelihood model has the last accepted parameter proposal.apop_opts.verbose=2
or greater, I will report the accept rate of the M-H sampler. It is a common rule of thumb to select a proposal so that this is between 20% and 50%. Set apop_opts.verbose=3
to see the stream of proposal points, their likelihoods, and the acceptance odds. You may want to set apop_opts.log_file=fopen("yourlog", "w")
first.draw
method that returns another step from the Markov chain with each draw.out->error='c' | Proposal was outside of a constraint; see above. |
apop_data* apop_model_numerical_covariance | ( | apop_data * | data, |
apop_model * | model, | ||
double | delta | ||
) |
Produce the covariance matrix for the parameters of an estimated model via the derivative of the score function at the parameter. I.e., I find the second derivative via apop_model_hessian , and take the negation of the inverse.
I follow Efron and Hinkley in using the estimated information matrix—the value of the information matrix at the estimated value of the score—not the expected information matrix that is the integral over all possible data. See Pawitan 2001 (who cribbed a little off of Efron and Hinkley) or Klemens 2008 (who directly cribbed off of both) for further details.
data | The data by which your model was estimated |
model | A model whose parameters have been estimated. |
delta | The differential by which to step for sampling changes. (default currently = 1e-3) |
"Covariance"
page, I'll set it to the result as well [i.e., I won't overwrite an existing covar].This function uses the Designated initializers syntax for inputs.
long double apop_multivariate_gamma | ( | double | a, |
int | p | ||
) |
The multivariate generalization of the Gamma distribution.
Because is undefined for
, this function returns
NAN
when takes on one of those values.
See also apop_multivariate_lngamma, which is more numerically stable in most cases.
long double apop_multivariate_lngamma | ( | double | a, |
int | p | ||
) |
The log of the multivariate generalization of the Gamma; see also apop_multivariate_gamma.
int apop_name_add | ( | apop_name * | n, |
char const * | add_me, | ||
char | type | ||
) |
Adds a name to the apop_name structure. Puts it at the end of the given list.
n | An existing, allocated apop_name structure. |
add_me | A string. If NULL , do nothing; return -1. |
type | 'r': add a row name 'c': add a column name 't': add a text category name 'h': add a title (or a header. 't' is taken). 'v': add (or overwrite) the vector name |
add_me
is NULL
, return -1. apop_name* apop_name_alloc | ( | void | ) |
Allocates a name structure
malloc
fails, return NULL
. Copy one apop_name structure to another. That is, all data is duplicated. Usage:
\param in the input names \return a structure that this function will allocate and fill
int apop_name_find | ( | const apop_name * | n, |
const char * | name, | ||
const char | type | ||
) |
Finds the position of an element in a list of names.
The function uses case-insensitive search (POSIX's strcasecmp
).
n | the apop_name object to search. |
name | the name you seek; see above. |
type | 'c' , 'r' , or 't' . Default is 'c' . |
findme
. If 'c'
, then this may be -1, meaning the vector name. If not found, returns -2. On error, e.g. name==NULL
, returns -2. void apop_name_print | ( | apop_name * | n | ) |
Prints the given list of names to STDOUT. Useful for debugging, and not much else.
n | The apop_name structure |
Append one list of names to another.
Notice that if the first list is empty, then this is a copy function. If the second is NULL
, it is a no-op.
n1 | The first set of names (no default, must not be NULL ) |
nadd | The second set of names, which will be appended after the first. (no default, if NULL , a no-op) |
type1 | Either 'c', 'r', 't', or 'v' stating whether you are merging the columns, rows, or text. If 'v', then ignore typeadd and just overwrite the target vector name with the source name. (default = 'r') |
typeadd | Either 'c', 'r', 't', or 'v' stating whether you are merging the columns, rows, or text. If 'v', then overwrite the target with the source vector name. (default = type1) |
apop_data* apop_rake | ( | char const * | margin_table, |
char *const * | var_list, | ||
int | var_ct, | ||
char const * | all_vars, | ||
char *const * | contrasts, | ||
int | contrast_ct, | ||
char const * | structural_zeros, | ||
int | max_iterations, | ||
double | tolerance, | ||
char const * | count_col, | ||
int | run_number, | ||
char const * | init_table, | ||
char const * | init_count_col, | ||
double | nudge, | ||
char const * | table_name | ||
) |
Fit a log-linear model via iterative proportional fitting, aka raking.
Raking has many uses. The Modeling with Data blog presents a series of discussions of uses of raking, including some worked examples.
Or see Wikipedia for an overview of Log linear models, aka Poisson regressions. One approach toward log-linear modeling is a regression form; let there be four categories, A, B, C, and D, from which we can produce a model positing, for example, that cell count is a function of a form like . In this case, we would assign a separate coefficient to every possible value of A, every possible value of (B, C), and every value of (C, D). Raking is the technique that searches for that large set of parameters.
The combinations of categories that are considered to be relevant are called contrasts, after ANOVA terminology of the 1940s.
The other constraint on the search are structural zeros, which are values that you know can never be non-zero, due to field-specific facts about the variables. For example, U.S. Social Security payments are available only to those age 65 or older, so "age <65 and gets_soc_security=1" is a structural zero.
Because there is one parameter for every combination, there may be millions of parameters to estimate, so the search to find the most likely value requires some attention to technique. For over half a century, the consensus method for searching has been raking, which iteratively draws each category closer to the mean in a somewhat simple manner (this was first developed circa 1940 and had to be feasible by hand), but which is guaranteed to eventually arrive at the maximum likelihood estimate for all cells.
Another complication is that the table is invariably sparse. One can easily construct tables with millions of cells, but the corresponding data set may have only a few thousand observations.
This function uses the database to resolve the sparseness problem. It constructs a query requesting all combinations of categories the could possibly be non-zero after raking, given all of the above constraints. Then, raking is done using only that subset. This means that the work is done on a number of cells proportional to the number of data points, not to the full cross of all categories. Set apop_opts.verbose
to 2 or greater to show the query on stderr
.
.init_table
, then an all-ones default table will be used.apop_opts.verbose=3
to see the intermediate tables at the end of each round of raking.margin_table | The name of the table in the database to use for calculating the margins. The table should have one observation per row. No default. (This used to be called table_name ; that name is now deprecated.) |
var_list | The full list of variables to search. A list of strings, e.g., (char *[]){"var1", "var2", ..., "var15"} |
var_ct | The count of the full list of variables to search. |
all_vars | deprecated. |
contrasts | The contrasts describing your model. Like the all_vars input, each contrast is a pipe-delimited list of variable names. No default. |
contrast_ct | The number of contrasts in the list of contrasts. No default. |
structural_zeros | a SQL clause indicating combinations that can never take a nonzero value. This will go into a where clause, so anything you could put there is OK, e.g. "age <65 and gets_soc_security=1 or age <15 and married=1". Your margin data is not checked for structural zeros. Default: no structural zeros. |
max_iterations | Number of rounds of raking at which the algorithm halts. Default: 1000. |
tolerance | I calculate the change for each cell from round to round; if the largest cell change is smaller than this, I stop. Default: 1e-5. |
count_col | This column gives the count of how many observations are represented by each row. If NULL , ech row represents one person. Default: NULL . |
run_number | Because I write intermediate tables to the database, I need a way to distinguish distinct runs should you be threading several runs at once. If you aren't running several instances simultaneously, don't worry about this; if you are, do supply a value, since it's hard for the function to supply one in a race-proof manner. Default: internally-maintained values. |
init_table | The default is to initially set all table elements to one and then rake from there. This is effectively the `fully synthetic' approach, which uses only the information in the margins and derives the data set closest to the all-ones data set that is consistent with the margins. Care is taken to maintan sparsity in this case. If you specify an init_table , then I will get the initial cell counts from it. Default: the fully-synthetic approach, using a starting point of an all-ones grid. |
init_count_col | The column in init_table with the cell counts. |
nudge | There is a common hack of adding a small value to every zero entry, because a zero entry will always scale to zero, while a small value could eventually scale to anything. Recall that this function works on sparse sets, so I first filter out those cells that could possibly have a nonzero value given the observations, then I add nudge to any zero cells within that subset. |
table_name | Deprecated; replaced with margin_table . |
weights
vector gives the most likely value for each cell. out->error='i' | Input was somehow wrong. |
out->error='c' | Raking did not converge, reached max. iteration count.
|
int apop_regex | ( | const char * | string, |
const char * | regex, | ||
apop_data ** | substrings, | ||
const char | use_case | ||
) |
A convenience function for regular expression searching
For example, "p.val" will match "P value", "p.value", "p values" (and even "tempeval", so be careful).
If you give a non-NULL
address in which to place a table of paren-delimited substrings, I'll return them as a row in the text element of the returned apop_data set. I'll return all the matches, filling the first row with substrings from the first application of your regex, then filling the next row with another set of matches (if any), and so on to the end of the string. Useful when parsing a list of items, for example.
string | The string to search (no default) |
regex | The regular expression (no default) |
substrings | Parens in the regex indicate that I should return matching substrings. Give me the address of an apop_data* set, and I will allocate and fill the text portion with matches. Default= NULL , meaning do not return substrings (even if parens exist in the regex). If no match, return an empty apop_data set, so output->textsize[0]==0 . |
use_case | Should I be case sensitive, 'y' or 'n' ? (default = 'n' , which is not the POSIX default.) |
substrings
may be allocated and filled if needed.apop_opts.stop_on_warning='n'
returns -1 on error (e.g., regex NULL
or didn't compile). strings==NULL
, I return 0—no match—and if substrings
is provided, set it to NULL
.&subs
, not plain subs
. Also, the non-match has a zero-length blank in subs->text[0][1]
.([A-Za-z])([0-9])
, the column zero of outdata
will hold letters, and column one will hold numbers. Use apop_data_transpose to reverse this so that the letters are in outdata->text[0]
and numbers in outdata->text[1]
. double apop_rng_GHgB3 | ( | gsl_rng * | r, |
double * | a | ||
) |
RNG from a Generalized Hypergeometric type B3.
Devroye uses this as the base for many of his distribution-generators, including the Waring.
GSL_NAN
if the function doesn't stop. void apop_settings_copy_group | ( | apop_model * | outm, |
apop_model * | inm, | ||
char * | copyme | ||
) |
Copy a settings group with the given name from the second model to the first. (i.e., the arguments are in memcpy order).
You probably won't need this often—just use apop_model_copy.
outm | The model that will receive a copy of the settings group. |
inm | The model that will provide the original. |
copyme | The string naming the group. For example, for an apop_mcmc_settings group, this would be "apop_mcmc" . |
outm->error=='s' | Error copying settings group. |
int apop_table_exists | ( | char const * | name, |
char | remove | ||
) |
Check for the existence of a table, and maybe delete it.
Recreating a table which already exists can cause errors, so it is good practice to check for existence first. Also, this is the stylish way to delete a table, since just calling "drop table"
will give you an error if the table doesn't exist.
name | the table name (no default) |
remove | 'd' ==>delete table so it can be recreated in main. 'n' ==>no action. Return result so program can continue. (default) |
apop_opts.stop_on_warn='n'
, returns -1 on errors.double apop_test | ( | double | statistic, |
char * | distribution, | ||
double | p1, | ||
double | p2, | ||
char | tail | ||
) |
This is a convenience function to do the lookup of a given statistic along a given distribution. You give me a statistic, its (hypothesized) distribution, and whether to use the upper tail, lower tail, or both. I will return the odds of a Type I error given the model—in statistician jargon, the -value. [Type I error: odds of rejecting the null hypothesis when it is true.]
For example,
will return the density of the standard Normal distribution that is more than 1.3 from zero. If this function returns a small value, we can be confident that the statistic is significant. Or,
will give the appropriate odds for an upper-tailed test using the -distribution with 10 degrees of freedom (e.g., a
-test of the null hypothesis that the statistic is less than or equal to zero).
Several more distributions are supported; see below.
statistic | The scalar value to be tested. |
distribution | The name of the distribution; see below. |
p1 | The first parameter for the distribution; see below. |
p2 | The second parameter for the distribution; see below. |
tail | 'u' = upper tail; 'l' = lower tail; anything else = two-tailed. (default = two-tailed) |
Here is a list of distributions you can use, and their parameters.
"normal"
or "gaussian"
"lognormal"
"uniform"
"t"
"chi squared"
, "chi"
, "chisq"
:
"f"
Run the Fisher exact test on an input contingency table.
out->error=='p' | Processing error in the test. |
int apop_text_add | ( | apop_data * | in, |
const size_t | row, | ||
const size_t | col, | ||
const char * | fmt, | ||
... | |||
) |
Add a string to the text element of an apop_data set. If you send me a NULL
string, I will write the value of apop_opts.nan_string
in the given slot. If there is already something in that slot, that string is freed, preventing memory leaks.
in | The apop_data set, that already has an allocated text element. |
row | The row |
col | The col |
fmt | The text to write. |
... | You can use a printf-style fmt and follow it with the usual variables to fill in. |
asprintf
), not a pointer to the input(s). asprintf(&(your_dataset->text[row][col]), "your string")
.This allocates an array of strings and puts it in the text
element of an apop_data set.
If the text
element already exists, then this is effectively a realloc
function, reshaping to the size you specify.
in | An apop_data set. It's OK to send in NULL , in which case an apop_data set with NULL matrix and vector elements is returned. |
row | the number of rows of text. |
col | the number of columns of text. |
NULL
, then this is a repeat of the input pointer. out->error=='a' | Allocation error. |
void apop_text_free | ( | char *** | freeme, |
int | rows, | ||
int | cols | ||
) |
Free a matrix of chars* (i.e., a char***). This is the form of the text element of the apop_data set, so you can use this for:
This is what apop_data_free
uses internally.
char* apop_text_paste | ( | apop_data const * | strings, |
char * | between, | ||
char * | before, | ||
char * | after, | ||
char * | between_cols, | ||
apop_fn_riip | prune, | ||
void * | prune_parameter | ||
) |
Join together a list or array of strings, with optional separators between the strings.
strings | An apop_data set with a grid of text to be combined into a single string |
between | The text to put in between the rows of the table, such as ", ". (Default is a single space: " ") |
before | The text to put at the head of the string. For the query example, this would be .before="select " . (Default: NULL) |
after | The text to put at the tail of the string. For the query example, .after=" from data_table" . (Default: NULL) |
between_cols | The text to insert between columns of text. See below for an example (Default is set to equal .between ) |
prune | If you don't want to use the entire text set, you can provide a function to indicate which elements should be pruned out. Some examples: |
prune_parameter | A void pointer to pass to your prune function. |
strings
table joined as per your specification. Allocated by the function, to be freed by you if desired.NULL
or has no text, I will print only the .before
and .after
parts with nothing in between. apop_opts.verbose >=3
, then print the pasted text to stderr. The sample snippet generates the SQL for a query using a list of column names (where the query begins with select
, ends with from datatab
, and has commas in between each element), re-processes the same list to produce the head of an HTML table, then produces the body of the table with the query result (pasting the tr
s and td
s into the right places).
Give me a column of text, and I'll give you a sorted list of the unique elements. This is basically running "select distinct * from datacolumn", but without the aid of the database.
d | An apop_data set with a text component |
col | The text column you want me to use. |
apop_model* apop_update | ( | apop_data * | data, |
apop_model * | prior, | ||
apop_model * | likelihood, | ||
gsl_rng * | rng | ||
) |
Take in a prior and likelihood distribution, and output a posterior distribution.
This function first checks a table of conjugate distributions for the pair you sent in. If the names match the table, then the function returns a closed-form model with updated parameters. If the parameters aren't in the table of conjugate priors/likelihoods, then it uses Markov Chain Monte Carlo to sample from the posterior distribution, and then outputs a histogram model for further analysis. Notably, the histogram can be used as the input to this function, so you can chain Bayesian updating procedures.
p
or log_likelihood
element, then I use apop_model_metropolis to generate the posterior.p
or log_likelihood
but does have a draw
element, then I make draws from the prior and weight them by the p
given by the likelihood distribution. This is not a rejection sampling method, so the burnin is ignored.Here are the conjugate distributions currently defined:
Prior | Likelihood | Notes |
Beta | Binomial | |
Beta | Bernoulli | |
Exponential | Gamma | Gamma likelihood represents the distribution of ![]() ![]() |
Normal | Normal | Assumes prior with fixed ![]() ![]() |
Gamma | Poisson | Uses sum and size of the data |
data | The input data, that will be used by the likelihood function (default = NULL .) |
prior | The prior apop_model. If the system needs to estimate the posterior via MCMC, this needs to have a log_likelihood or p method. (No default, must not be NULL .) |
likelihood | The likelihood apop_model. If the system needs to estimate the posterior via MCMC, this needs to have a log_likelihood or p method (ll preferred). (No default, must not be NULL .) |
rng | A gsl_rng , already initialized (e.g., via apop_rng_alloc). (default: an RNG from apop_rng_get_thread) |
Here is a test function that compares the output via conjugate table and via Metropolis-Hastings sampling:
double apop_vector_cov | ( | const gsl_vector * | v1, |
const gsl_vector * | v2, | ||
const gsl_vector * | weights | ||
) |
Find the sample covariance of a pair of vectors, with an optional weighting. This only makes sense if the weightings are identical, so the function takes only one weighting vector for both.
v1,v2 | The data vectors |
weights | The weight vector. Default: equal weights for all elements. |
double apop_vector_mean | ( | gsl_vector const * | v, |
gsl_vector const * | weights | ||
) |
Find the mean, weighted or unweighted.
v | The data vector |
weights | The weight vector. Default: assume equal weights. |
int gsl_vector* apop_vector_moving_average | ( | gsl_vector * | v, |
size_t | bandwidth | ||
) |
Return a new vector that is the moving average of the input vector.
v | The input vector, unsmoothed |
bandwidth | The number of elements to be smoothed. |
void apop_vector_normalize | ( | gsl_vector * | in, |
gsl_vector ** | out, | ||
const char | normalization_type | ||
) |
This function will normalize a vector, either such that it has mean zero and variance one, or ranges between zero and one, or sums to one.
in | A gsl_vector which you have already allocated and filled. NULL input gives NULL output. (No default) |
out | If normalizing in place, NULL . If not, the address of a gsl_vector . Do not allocate. (default = NULL .) |
normalization_type | 'p': normalized vector will sum to one. E.g., start with a set of observations in bins, end with the percentage of observations in each bin. (the default) 'r': normalized vector will range between zero and one. Replace each X with (X-min) / (max - min). 's': normalized vector will have mean zero and variance one. Replace each X with ![]() ![]() 'm': normalize to mean zero: Replace each X with ![]() |
Example
double* apop_vector_percentiles | ( | gsl_vector * | data, |
char | rounding | ||
) |
Returns an array of size 101, where returned_vector
[95] gives the value of the 95th percentile, for example. Returned_vector
[100] is always the maximum value, and returned_vector
[0] is always the min (regardless of rounding rule).
data | a gsl_vector of data. (No default, must not be NULL .) |
rounding | This will either be 'u' , 'd' , or 'a' . Unless your data is exactly a multiple of 101, some percentiles will be ambiguous. If 'u' , then round up (use the next highest value); if 'd' (or anything else), round down to the next lowest value; if 'a' , take the mean of the two nearest points. If 'u' or 'a' , then you can say "5% or more of the sample is below \c returned_vector[5]"; if 'd' or 'a' , then you can say "5% or more of the sample is above returned_vector[5]". (Default = 'd' .) |
free()
the array returned by this function. gsl_vector* apop_vector_realloc | ( | gsl_vector * | v, |
size_t | newheight | ||
) |
This function will resize a gsl_vector
to a new length.
Data in the vector will be retained. If the new height is smaller than the old, then data at the end of the vector will be cropped away (in a non–memory-leaking manner). If the new height is larger than the old, then new cells will be filled with garbage; it is your responsibility to zero out or otherwise fill them before use.
Warning I: Using this function is basically bad form—especially when used in a for
loop that adds an element each time. A large number of realloc
s can take a noticeable amount of time. You are thus encouraged to make an effort to determine the size of your data beforehand.
Warning II: The gsl_vector
is a versatile struct that can represent subvectors, matrix columns and other cuts from parent data. I can't deal with those, and check for such situations beforehand. [Besides, resizing a portion of a parent matrix makes no sense.]
v | The already-allocated vector to resize. If you give me NULL , this is equivalent to gsl_vector_alloc |
newheight | The height you'd like the vector to be. |
apop_data* apop_vector_to_data | ( | gsl_vector * | v | ) |
Deprecated; please do not use. Just use a compound literal, as in the code sample in the documentation for apop_matrix_to_data.
gsl_vector* apop_vector_unique_elements | ( | const gsl_vector * | v | ) |
Give me a vector of numbers, and I'll give you a sorted list of the unique elements. This is basically running "select distinct datacol from data order by datacol", but without the aid of the database.
v | a vector of items |
double apop_vector_var | ( | gsl_vector const * | v, |
gsl_vector const * | weights | ||
) |
Find the sample variance of a vector, weighted or unweighted.
v | The data vector |
weights | The weight vector. If NULL, assume equal weights. |
w->size
as the number of elements, and returns the usual sum over apop_opts_type apop_opts |
Here are where the options are initially set. See the apop_opts_type documentation for details.