Patterns in static

Apophenia

Data Structures | Macros | Typedefs | Functions | Variables
apop.h File Reference

Go to the source code of this file.

Data Structures

struct  apop_name
 
struct  apop_data
 
struct  apop_settings_type
 
struct  apop_model
 
struct  apop_opts_type
 
struct  apop_mle_settings
 
struct  apop_lm_settings
 
struct  apop_parts_wanted_settings
 
struct  apop_cdf_settings
 
struct  apop_pm_settings
 
struct  apop_pmf_settings
 
struct  apop_kernel_density_settings
 
struct  apop_mcmc_proposal_s
 
struct  apop_mcmc_settings
 
struct  loess_struct
 
struct  apop_loess_settings
 
struct  apop_arms_settings
 
struct  apop_stack_settings
 
struct  apop_ct_settings
 
struct  apop_dconstrain_settings
 
struct  apop_composition_settings
 
struct  apop_mixture_settings
 

Macros

#define _GNU_SOURCE
 
#define apop_varad_head(type, name)   type variadic_##name(variadic_type_##name varad_in)
 
#define apop_varad_declare(type, name,...)
 
#define apop_varad_var(name, value)   name = varad_in.name ? varad_in.name : (value);
 
#define apop_varad_link(name,...)   variadic_##name((variadic_type_##name) {__VA_ARGS__})
 
#define apop_data_add_names(dataset, type,...)   apop_data_add_names_base((dataset), (type), (char const*[]) {__VA_ARGS__, NULL})
 
#define apop_data_free(freeme)   (apop_data_free_base(freeme) ? 0 : ((freeme)= NULL))
 
#define apop_data_prune_columns(in,...)
 
#define apop_line_to_vector   apop_array_to_vector
 
#define apop_vector_fill(avfin,...)   apop_vector_fill_base((avfin), (double []) {__VA_ARGS__})
 
#define apop_data_fill(adfin,...)   apop_data_fill_base((adfin), (double []) {__VA_ARGS__})
 
#define apop_text_fill(dataset,...)   apop_text_fill_base((dataset), (char* []) {__VA_ARGS__, NULL})
 
#define apop_data_falloc(sizes,...)   apop_data_fill(apop_data_alloc sizes, __VA_ARGS__)
 
#define apop_gaussian
 
#define apop_OLS   apop_ols
 
#define apop_PMF   apop_pmf
 
#define apop_F_distribution   apop_f_distribution
 
#define apop_WLS   apop_wls
 
#define apop_IV   apop_iv
 
#define apop_model_set_parameters(in,...)
 
#define apop_model_mixture(...)   apop_model_mixture_base((apop_model *[]){__VA_ARGS__, NULL})
 
#define apop_model_stack(...)   apop_model_stack_base((apop_model *[]){__VA_ARGS__, NULL})
 
#define apop_ANOVA   apop_anova
 
#define apop_F_test   apop_f_test
 
#define apop_estimate_r_squared(in)
 
#define apop_rng_get_thread(thread_in)
 
#define apop_update_hash(m1, m2)
 
#define apop_update_vtable_add(fn,...)   apop_update_type_check(fn), apop_vtable_add("apop_update", fn, apop_update_hash(__VA_ARGS__))
 
#define apop_update_vtable_get(...)   apop_vtable_get("apop_update", apop_update_hash(__VA_ARGS__))
 
#define apop_update_vtable_drop(...)   apop_vtable_drop("apop_update", apop_update_hash(__VA_ARGS__))
 
#define apop_score_hash(m1)   ((size_t)((m1)->log_likelihood ? (m1)->log_likelihood : (m1)->p))
 
#define apop_score_vtable_add(fn,...)   apop_score_type_check(fn), apop_vtable_add("apop_score", fn, apop_score_hash(__VA_ARGS__))
 
#define apop_score_vtable_get(...)   apop_vtable_get("apop_score", apop_score_hash(__VA_ARGS__))
 
#define apop_score_vtable_drop(...)   apop_vtable_drop("apop_score", apop_score_hash(__VA_ARGS__))
 
#define apop_parameter_model_hash(m1)   ((size_t)((m1)->log_likelihood ? (m1)->log_likelihood : (m1)->p)*33 + (m1)->estimate ? (size_t)(m1)->estimate: 27)
 
#define apop_parameter_model_vtable_add(fn,...)   apop_parameter_model_type_check(fn), apop_vtable_add("apop_parameter_model", fn, apop_parameter_model_hash(__VA_ARGS__))
 
#define apop_parameter_model_vtable_get(...)   apop_vtable_get("apop_parameter_model", apop_parameter_model_hash(__VA_ARGS__))
 
#define apop_parameter_model_vtable_drop(...)   apop_vtable_drop("apop_parameter_model", apop_parameter_model_hash(__VA_ARGS__))
 
#define apop_predict_hash(m1)   ((size_t)((m1)->log_likelihood ? (m1)->log_likelihood : (m1)->p)*33 + (m1)->estimate ? (size_t)(m1)->estimate: 27)
 
#define apop_predict_vtable_add(fn,...)   apop_predict_type_check(fn), apop_vtable_add("apop_predict", fn, apop_predict_hash(__VA_ARGS__))
 
#define apop_predict_vtable_get(...)   apop_vtable_get("apop_predict", apop_predict_hash(__VA_ARGS__))
 
#define apop_predict_vtable_drop(...)   apop_vtable_drop("apop_predict", apop_predict_hash(__VA_ARGS__))
 
#define apop_model_print_hash(m1)
 
#define apop_model_print_vtable_add(fn,...)   apop_model_print_type_check(fn), apop_vtable_add("apop_model_print", fn, apop_model_print_hash(__VA_ARGS__))
 
#define apop_model_print_vtable_get(...)   apop_vtable_get("apop_model_print", apop_model_print_hash(__VA_ARGS__))
 
#define apop_model_print_vtable_drop(...)   apop_vtable_drop("apop_model_print", apop_model_print_hash(__VA_ARGS__))
 
#define apop_test_ANOVA_independence(d)   apop_test_anova_independence(d)
 
#define Apop_notify(verbosity,...)
 
#define Apop_maybe_abort(level)
 
#define Apop_stopif(test, onfail, level,...)
 
#define apop_errorlevel   -5
 
#define apop_return_data_error(E)   {apop_data *out=apop_data_alloc(); out->error='E'; return out;}
 
#define Apop_assert_c(test, returnval, level,...)   Apop_stopif(!(test), return returnval, level, __VA_ARGS__)
 
#define Apop_assert(test,...)   Apop_assert_c((test), 0, apop_errorlevel, __VA_ARGS__)
 
#define Apop_assert_n(test,...)   Apop_assert_c((test), , apop_errorlevel, __VA_ARGS__)
 
#define Apop_assert_nan(test,...)   Apop_assert_c((test), GSL_NAN, apop_errorlevel, __VA_ARGS__)
 
#define Apop_assert_negone(test,...)   Apop_assert_c((test), -1, apop_errorlevel, __VA_ARGS__)
 
#define apop_ml_imputation(d, m)   apop_ml_impute(d, m)
 
#define APOP_SUBMATRIX(m, srow, scol, nrows, ncols, o)
 
#define Apop_row_v(m, row, v)   Apop_matrix_row((m)->matrix, row, v)
 
#define Apop_col_v(m, col, v)
 
#define Apop_rows(d, rownum, len, outd)
 
#define Apop_row(d, row, outd)
 
#define Apop_cols(d, colnum, len, outd)
 
#define Apop_row_tv(m, row, v)
 
#define Apop_col_tv(m, col, v)
 
#define Apop_row_t(d, rowname, outd)
 
#define Apop_col_t(d, colname, outd)
 
#define Apop_subm(data_to_view, srow, scol, nrows, ncols)
 
#define Apop_rv(data_to_view, row)
 
#define Apop_cv(data_to_view, col)
 
#define apop_subvector(v, start, len)
 
#define apop_mrow(m, row)
 
#define Apop_rs(d, rownum, len)
 
#define Apop_cs(d, colnum, len)
 
#define Apop_r(d, rownum)
 
#define Apop_c(d, col)
 
#define APOP_COL   Apop_col
 
#define apop_col   Apop_col
 
#define APOP_COLS   Apop_cols
 
#define apop_cols   Apop_cols
 
#define APOP_COL_T   Apop_col_t
 
#define apop_col_t   Apop_col_t
 
#define APOP_COL_TV   Apop_col_tv
 
#define apop_col_tv   Apop_col_tv
 
#define APOP_COL_V   Apop_col_v
 
#define apop_col_v   Apop_col_v
 
#define APOP_ROW   Apop_row
 
#define apop_row   Apop_row
 
#define Apop_data_row   Apop_row #deprecated
 
#define APOP_ROWS   Apop_rows
 
#define apop_rows   Apop_rows
 
#define APOP_ROW_T   Apop_row_t
 
#define apop_row_t   Apop_row_t
 
#define APOP_ROW_TV   Apop_row_tv
 
#define apop_row_tv   Apop_row_tv
 
#define APOP_ROW_V   Apop_row_v
 
#define apop_row_v   Apop_row_v
 
#define Apop_matrix_row(m, row, v)
 
#define Apop_matrix_col(m, col, v)
 
#define Apop_submatrix   APOP_SUBMATRIX
 
#define APOP_MATRIX_ROW   Apop_matrix_row
 
#define apop_matrix_row   Apop_matrix_row
 
#define APOP_MATRIX_COL   Apop_matrix_col
 
#define apop_matrix_col   Apop_matrix_col
 
#define apop_sum(in)   apop_vector_sum(in)
 
#define apop_var(in)   apop_vector_var(in)
 
#define apop_mean(in)
 
#define Apop_settings_get_group(m, type)
 
#define Apop_settings_rm_group(m, type)
 
#define Apop_settings_add_group(model, type,...)
 
#define apop_model_copy_set(model, type,...)
 
#define Apop_settings_get(model, type, setting)
 
#define Apop_settings_set(model, type, setting, data)
 
#define Apop_settings_declarations(ysg)
 
#define Apop_settings_init(name,...)
 
#define Apop_varad_set(var, value)   (out)->var = (in).var ? (in).var : (value);
 
#define Apop_settings_copy(name,...)
 
#define Apop_settings_free(name,...)
 
#define apop_model_coordinate_transform(...)   Apop_model_copy_set(apop_coordinate_transform, apop_ct, __VA_ARGS__)
 
#define apop_model_dcompose(...)   Apop_model_copy_set(apop_composition, apop_composition, __VA_ARGS__)
 
#define apop_model_dconstrain(...)   Apop_model_copy_set(apop_dconstrain, apop_dconstrain, __VA_ARGS__)
 

Typedefs

typedef struct apop_data apop_data
 
typedef struct apop_model apop_model
 
typedef apop_model *(* apop_update_type )(apop_data *, apop_model *, apop_model *)
 
typedef void(* apop_score_type )(apop_data *d, gsl_vector *gradient, apop_model *params)
 
typedef apop_model *(* apop_parameter_model_type )(apop_data *, apop_model *)
 
typedef apop_data *(* apop_predict_type )(apop_data *d, apop_model *params)
 
typedef void(* apop_model_print_type )(apop_model *params, FILE *out)
 
typedef struct apop_mcmc_proposal_s apop_mcmc_proposal_s
 
typedef struct apop_mcmc_settings apop_mcmc_settings
 

Functions

apop_name * apop_name_alloc (void)
 
int apop_name_add (apop_name *n, char const *add_me, char type)
 
void apop_name_free (apop_name *free_me)
 
void apop_name_print (apop_name *n)
 
void apop_name_stack (apop_name *n1, apop_name *nadd, char type1, char typeadd)
 
apop_name * apop_name_copy (apop_name *in)
 
int apop_name_find (const apop_name *n, const char *findme, const char type)
 
void apop_data_add_names_base (apop_data *d, const char type, char const **names)
 
char apop_data_free_base (apop_data *freeme)
 
apop_data * apop_matrix_to_data (gsl_matrix *m)
 
apop_data * apop_vector_to_data (gsl_vector *v)
 
apop_data * apop_data_alloc (const size_t size1, const size_t size2, const int size3)
 
apop_data * apop_data_calloc (const size_t size1, const size_t size2, const int size3)
 
apop_data * apop_data_stack (apop_data *m1, apop_data *m2, char posn, char inplace)
 
apop_data ** apop_data_split (apop_data *in, int splitpoint, char r_or_c)
 
apop_data * apop_data_copy (const apop_data *in)
 
void apop_data_rm_columns (apop_data *d, int *drop)
 
void apop_data_memcpy (apop_data *out, const apop_data *in)
 
double * apop_data_ptr (apop_data *data, int row, int col, const char *rowname, const char *colname, const char *page)
 
double apop_data_get (const apop_data *data, size_t row, int col, const char *rowname, const char *colname, const char *page)
 
int apop_data_set (apop_data *data, size_t row, int col, const double val, const char *rowname, const char *colname, const char *page)
 
void apop_data_add_named_elmt (apop_data *d, char *name, double val)
 
int apop_text_add (apop_data *in, const size_t row, const size_t col, const char *fmt,...)
 
apop_data * apop_text_alloc (apop_data *in, const size_t row, const size_t col)
 
void apop_text_free (char ***freeme, int rows, int cols)
 
apop_data * apop_data_transpose (apop_data *in, char transpose_text, char inplace)
 
gsl_matrix * apop_matrix_realloc (gsl_matrix *m, size_t newheight, size_t newwidth)
 
gsl_vector * apop_vector_realloc (gsl_vector *v, size_t newheight)
 
apop_data * apop_data_prune_columns_base (apop_data *d, char **colnames)
 
apop_data * apop_data_get_page (const apop_data *data, const char *title, const char match)
 
apop_data * apop_data_add_page (apop_data *dataset, apop_data *newpage, const char *title)
 
apop_data * apop_data_rm_page (apop_data *data, const char *title, const char free_p)
 
apop_data * apop_data_rm_rows (apop_data *in, int *drop, int(*do_drop)(apop_data *, void *), void *drop_parameter)
 
apop_data * apop_model_draws (apop_model *model, int count, apop_data *draws)
 
gsl_vector * apop_vector_copy (const gsl_vector *in)
 
gsl_matrix * apop_vector_to_matrix (const gsl_vector *in, char row_col)
 
gsl_matrix * apop_matrix_copy (const gsl_matrix *in)
 
apop_data * apop_db_to_crosstab (char *tabname, char *r1, char *r2, char *datacol)
 
gsl_vector * apop_array_to_vector (double *in, int size)
 
apop_data * apop_text_to_data (char const *text_file, int has_row_names, int has_col_names, int const *field_ends, char const *delimiters)
 
int apop_text_to_db (char const *text_file, char *tabname, int has_row_names, int has_col_names, char **field_names, int const *field_ends, apop_data *field_params, char *table_params, char const *delimiters, char if_table_exists)
 
apop_data * apop_data_rank_expand (apop_data *in)
 
apop_data * apop_data_rank_compress (apop_data *in)
 
void apop_crosstab_to_db (apop_data *in, char *tabname, char *row_col_name, char *col_col_name, char *data_col_name)
 
gsl_vector * apop_data_pack (const apop_data *in, gsl_vector *out, char all_pages, char use_info_pages)
 
void apop_data_unpack (const gsl_vector *in, apop_data *d, char use_info_pages)
 
apop_data * apop_data_fill_base (apop_data *in, double[])
 
gsl_vector * apop_vector_fill_base (gsl_vector *in, double[])
 
apop_data * apop_text_fill_base (apop_data *data, char *text[])
 
int apop_data_set_row (apop_data *row, apop_data *d, int row_number)
 
void apop_model_free (apop_model *free_me)
 
void apop_model_print (apop_model *print_me, FILE *out)
 
void apop_model_show (apop_model *print_me)
 
apop_model * apop_model_copy (apop_model *in)
 
apop_model * apop_model_clear (apop_data *data, apop_model *model)
 
apop_model * apop_estimate (apop_data *d, apop_model *m)
 
void apop_score (apop_data *d, gsl_vector *out, apop_model *m)
 
double apop_log_likelihood (apop_data *d, apop_model *m)
 
double apop_p (apop_data *d, apop_model *m)
 
double apop_cdf (apop_data *d, apop_model *m)
 
int apop_draw (double *out, gsl_rng *r, apop_model *m)
 
void apop_prep (apop_data *d, apop_model *m)
 
apop_model * apop_parameter_model (apop_data *d, apop_model *m)
 
apop_data * apop_predict (apop_data *d, apop_model *m)
 
apop_model * apop_beta_from_mean_var (double m, double v)
 
apop_model * apop_model_set_parameters_base (apop_model *in, double ap[])
 
apop_model * apop_model_mixture_base (apop_model **inlist)
 
apop_model * apop_model_stack_base (apop_model *mlist[])
 
apop_data * apop_map (apop_data *in, double(*fn_d)(double), double(*fn_v)(gsl_vector *), double(*fn_r)(apop_data *), double(*fn_dp)(double, void *), double(*fn_vp)(gsl_vector *, void *), double(*fn_rp)(apop_data *, void *), double(*fn_dpi)(double, void *, int), double(*fn_vpi)(gsl_vector *, void *, int), double(*fn_rpi)(apop_data *, void *, int), double(*fn_di)(double, int), double(*fn_vi)(gsl_vector *, int), double(*fn_ri)(apop_data *, int), void *param, int inplace, char part, int all_pages)
 
double apop_map_sum (apop_data *in, double(*fn_d)(double), double(*fn_v)(gsl_vector *), double(*fn_r)(apop_data *), double(*fn_dp)(double, void *), double(*fn_vp)(gsl_vector *, void *), double(*fn_rp)(apop_data *, void *), double(*fn_dpi)(double, void *, int), double(*fn_vpi)(gsl_vector *, void *, int), double(*fn_rpi)(apop_data *, void *, int), double(*fn_di)(double, int), double(*fn_vi)(gsl_vector *, int), double(*fn_ri)(apop_data *, int), void *param, char part, int all_pages)
 
gsl_vector * apop_matrix_map (const gsl_matrix *m, double(*fn)(gsl_vector *))
 
gsl_vector * apop_vector_map (const gsl_vector *v, double(*fn)(double))
 
void apop_matrix_apply (gsl_matrix *m, void(*fn)(gsl_vector *))
 
void apop_vector_apply (gsl_vector *v, void(*fn)(double *))
 
gsl_matrix * apop_matrix_map_all (const gsl_matrix *in, double(*fn)(double))
 
void apop_matrix_apply_all (gsl_matrix *in, void(*fn)(double *))
 
double apop_vector_map_sum (const gsl_vector *in, double(*fn)(double))
 
double apop_matrix_map_sum (const gsl_matrix *in, double(*fn)(gsl_vector *))
 
double apop_matrix_map_all_sum (const gsl_matrix *in, double(*fn)(double))
 
void apop_plot_histogram (gsl_vector *data, size_t bin_count, char *with, char const *output_name, FILE *output_pipe, char output_type, char output_append)
 
void apop_matrix_print (const gsl_matrix *data, char const *output_name, FILE *output_pipe, char output_type, char output_append)
 
void apop_vector_print (gsl_vector *data, char const *output_name, FILE *output_pipe, char output_type, char output_append)
 
void apop_data_print (const apop_data *data, char const *output_name, FILE *output_pipe, char output_type, char output_append)
 
void apop_matrix_show (const gsl_matrix *data)
 
void apop_vector_show (const gsl_vector *data)
 
void apop_data_show (const apop_data *data)
 
double apop_vector_mean (gsl_vector const *v, gsl_vector const *weights)
 
double apop_vector_var (gsl_vector const *v, gsl_vector const *weights)
 
double apop_vector_skew_pop (gsl_vector const *v, gsl_vector const *weights)
 
double apop_vector_kurtosis_pop (gsl_vector const *v, gsl_vector const *weights)
 
double apop_vector_cov (gsl_vector const *v1, gsl_vector const *v2, gsl_vector const *weights)
 
double apop_vector_distance (const gsl_vector *ina, const gsl_vector *inb, const char metric, const double norm)
 
void apop_vector_normalize (gsl_vector *in, gsl_vector **out, const char normalization_type)
 
void apop_matrix_normalize (gsl_matrix *data, const char row_or_col, const char normalization)
 
apop_data * apop_data_covariance (const apop_data *in)
 
apop_data * apop_data_correlation (const apop_data *in)
 
long double apop_matrix_sum (const gsl_matrix *m)
 
double apop_matrix_mean (const gsl_matrix *data)
 
void apop_matrix_mean_and_var (const gsl_matrix *data, double *mean, double *var)
 
apop_data * apop_data_summarize (apop_data *data)
 
apop_data * apop_test_fisher_exact (apop_data *intab)
 
int apop_matrix_is_positive_semidefinite (gsl_matrix *m, char semi)
 
double apop_matrix_to_positive_semidefinite (gsl_matrix *m)
 
long double apop_multivariate_gamma (double a, int p)
 
long double apop_multivariate_lngamma (double a, int p)
 
apop_data * apop_t_test (gsl_vector *a, gsl_vector *b)
 
apop_data * apop_paired_t_test (gsl_vector *a, gsl_vector *b)
 
apop_data * apop_anova (char *table, char *data, char *grouping1, char *grouping2)
 
apop_data * apop_f_test (apop_model *est, apop_data *contrast)
 
apop_data * apop_text_unique_elements (const apop_data *d, size_t col)
 
gsl_vector * apop_vector_unique_elements (const gsl_vector *v)
 
apop_data * apop_data_to_factors (apop_data *data, char intype, int incol, int outcol)
 
apop_data * apop_data_get_factor_names (apop_data *data, int col, char type)
 
apop_data * apop_data_to_dummies (apop_data *d, int col, char type, int keep_first, char append, char remove)
 
double apop_kl_divergence (apop_model *from, apop_model *to, int draw_ct, gsl_rng *rng)
 
apop_data * apop_estimate_coefficient_of_determination (apop_model *)
 
void apop_estimate_parameter_tests (apop_model *est)
 
apop_data * apop_jackknife_cov (apop_data *data, apop_model *model)
 
apop_data * apop_bootstrap_cov (apop_data *data, apop_model *model, gsl_rng *rng, int iterations, char keep_boots, char ignore_nans)
 
gsl_rng * apop_rng_alloc (int seed)
 
double apop_rng_GHgB3 (gsl_rng *r, double *a)
 
gsl_rng * apop_rng_get_thread_base (int thread)
 
int apop_arms_draw (double *out, gsl_rng *r, apop_model *m)
 Adaptive rejection metropolis sampling. More...
 
gsl_vector * apop_numerical_gradient (apop_data *data, apop_model *model, double delta)
 
apop_data * apop_model_hessian (apop_data *data, apop_model *model, double delta)
 
apop_data * apop_model_numerical_covariance (apop_data *data, apop_model *model, double delta)
 
void apop_maximum_likelihood (apop_data *data, apop_model *dist)
 
apop_model * apop_estimate_restart (apop_model *e, apop_model *copy, char *starting_pt, double boundary)
 
long double apop_linear_constraint (gsl_vector *beta, apop_data *constraint, double margin)
 
apop_model * apop_model_fix_params (apop_model *model_in)
 
apop_model * apop_model_fix_params_get_base (apop_model *model_in)
 
int apop_vtable_add (char const *tabname, void *fn_in, unsigned long hash)
 
void * apop_vtable_get (char const *tabname, unsigned long hash)
 
int apop_vtable_drop (char const *tabname, unsigned long hash)
 
void apop_update_type_check (apop_update_type in)
 
void apop_score_type_check (apop_score_type in)
 
void apop_parameter_model_type_check (apop_parameter_model_type in)
 
void apop_predict_type_check (apop_predict_type in)
 
void apop_model_print_type_check (apop_model_print_type in)
 
double apop_generalized_harmonic (int N, double s) __attribute__((__pure__))
 
apop_data * apop_test_anova_independence (apop_data *d)
 
int apop_regex (const char *string, const char *regex, apop_data **substrings, const char use_case)
 
int apop_system (const char *fmt,...) __attribute__((format(printf, 1, 2)))
 
gsl_vector * apop_vector_moving_average (gsl_vector *, size_t)
 
apop_data * apop_histograms_test_goodness_of_fit (apop_model *h0, apop_model *h1)
 
apop_data * apop_test_kolmogorov (apop_model *m1, apop_model *m2)
 
apop_data * apop_data_pmf_compress (apop_data *in)
 
apop_data * apop_data_to_bins (apop_data *indata, apop_data *binspec, int bin_count, char close_top_bin)
 
apop_model * apop_model_to_pmf (apop_model *model, apop_data *binspec, long int draws, int bin_count, gsl_rng *rng)
 
char * apop_text_paste (apop_data const *strings, char *between, char *before, char *after, char *between_cols, int(*prune)(apop_data *, int, int, void *), void *prune_parameter)
 
apop_data * apop_data_listwise_delete (apop_data *d, char inplace)
 
apop_model * apop_ml_impute (apop_data *d, apop_model *meanvar)
 
apop_model * apop_model_metropolis (apop_data *d, gsl_rng *rng, apop_model *m)
 
apop_model * apop_update (apop_data *data, apop_model *prior, apop_model *likelihood, gsl_rng *rng)
 
double apop_test (double statistic, char *distribution, double p1, double p2, char tail)
 
double * apop_vector_percentiles (gsl_vector *data, char rounding)
 
apop_data * apop_data_sort (apop_data *data, apop_data *sort_order, char asc, char inplace, double *col_order)
 
apop_data * apop_rake (char const *margin_table, char *const *var_list, int var_ct, char const *all_vars, char *const *contrasts, int contrast_ct, char const *structural_zeros, int max_iterations, double tolerance, char const *count_col, int run_number, char const *init_table, char const *init_count_col, double nudge, char const *table_name)
 
double apop_det_and_inv (const gsl_matrix *in, gsl_matrix **out, int calc_det, int calc_inv)
 
apop_data * apop_dot (const apop_data *d1, const apop_data *d2, char form1, char form2)
 
int apop_vector_bounded (const gsl_vector *in, long double max)
 
gsl_matrix * apop_matrix_inverse (const gsl_matrix *in)
 
double apop_matrix_determinant (const gsl_matrix *in)
 
apop_data * apop_matrix_pca (gsl_matrix *data, int const dimensions_we_want)
 
gsl_vector * apop_vector_stack (gsl_vector *v1, gsl_vector *v2, char inplace)
 
gsl_matrix * apop_matrix_stack (gsl_matrix *m1, gsl_matrix *m2, char posn, char inplace)
 
gsl_matrix * apop_matrix_rm_columns (gsl_matrix *in, int *drop)
 
void apop_vector_log (gsl_vector *v)
 
void apop_vector_log10 (gsl_vector *v)
 
void apop_vector_exp (gsl_vector *v)
 
long double apop_vector_sum (const gsl_vector *in)
 
double apop_vector_var_m (const gsl_vector *in, const double mean)
 
double apop_vector_correlation (const gsl_vector *ina, const gsl_vector *inb)
 
double apop_vector_kurtosis (const gsl_vector *in)
 
double apop_vector_skew (const gsl_vector *in)
 
int apop_table_exists (char const *name, char remove)
 
int apop_db_open (char const *filename)
 
int apop_db_close (char vacuum)
 
int apop_query (const char *q,...) __attribute__((format(printf, 1, 2)))
 
gsl_matrix * apop_query_to_matrix (const char *fmt,...) __attribute__((format(printf, 1, 2)))
 
apop_data * apop_query_to_text (const char *fmt,...) __attribute__((format(printf, 1, 2)))
 
apop_data * apop_query_to_data (const char *fmt,...) __attribute__((format(printf, 1, 2)))
 
apop_data * apop_query_to_mixed_data (const char *typelist, const char *fmt,...) __attribute__((format(printf, 2, 3)))
 
gsl_vector * apop_query_to_vector (const char *fmt,...) __attribute__((format(printf, 1, 2)))
 
double apop_query_to_float (const char *fmt,...) __attribute__((format(printf, 1, 2)))
 
int apop_data_to_db (const apop_data *set, const char *tabname, char)
 
void * apop_settings_get_grp (apop_model *m, char *type, char fail)
 
void apop_settings_remove_group (apop_model *m, char *delme)
 
void apop_settings_copy_group (apop_model *outm, apop_model *inm, char *copyme)
 
void * apop_settings_group_alloc (apop_model *model, char *type, void *free_fn, void *copy_fn, void *the_group)
 
apop_model * apop_settings_group_alloc_wm (apop_model *model, char *type, void *free_fn, void *copy_fn, void *the_group)
 

Variables

apop_opts_type apop_opts
 
apop_model * apop_beta
 
apop_model * apop_bernoulli
 
apop_model * apop_binomial
 
apop_model * apop_chi_squared
 
apop_model * apop_dirichlet
 
apop_model * apop_exponential
 
apop_model * apop_f_distribution
 
apop_model * apop_gamma
 
apop_model * apop_improper_uniform
 
apop_model * apop_iv
 
apop_model * apop_kernel_density
 
apop_model * apop_loess
 
apop_model * apop_logit
 
apop_model * apop_lognormal
 
apop_model * apop_multinomial
 
apop_model * apop_multivariate_normal
 
apop_model * apop_normal
 
apop_model * apop_ols
 
apop_model * apop_pmf
 
apop_model * apop_poisson
 
apop_model * apop_probit
 
apop_model * apop_t_distribution
 
apop_model * apop_uniform
 
apop_model * apop_wls
 
apop_model * apop_yule
 
apop_model * apop_zipf
 
apop_model * apop_coordinate_transform
 
apop_model * apop_composition
 
apop_model * apop_dconstrain
 
apop_model * apop_mixture
 
apop_model * apop_stack
 

Macro Definition Documentation

#define Apop_c (   d,
  col 
)

A macro to generate a temporary one-column view of apop_data set d, pulling out only column col. The macro evaluates to a pointer to this temporary view, which you can use as you would any apop_data set.

See also
Apop_cs, Apop_cv, Apop_col_tv, Apop_col_t, Apop_matrix_col
#define Apop_col_t (   d,
  colname,
  outd 
)

After this call, outd will hold a view of the apop_data set d consisting only of the column named colname. Unlike Apop_c, the second argument is a column name, that I'll look up using apop_name_find.

See also
Apop_cs, Apop_c, Apop_cv, Apop_col_tv, Apop_matrix_col
#define Apop_col_tv (   m,
  col,
  v 
)

After this call, v will hold a vector view of the colth column of the apop_data set m. Unlike Apop_cv, the second argument is a column name, that I'll look up using apop_name_find.

See also
Apop_cs, Apop_c, Apop_cv, Apop_col_t, Apop_matrix_col
#define Apop_col_v (   m,
  col,
  v 
)

Deprecated. Use Apop_cv.

#define Apop_cols (   d,
  colnum,
  len,
  outd 
)

Deprecated. Use Apop_cs.

#define Apop_cs (   d,
  colnum,
  len 
)

A macro to generate a temporary view of apop_data set d, beginning at column colnum and having length len. It expires as soon as the program leaves the current scope (like with the usual automatically declared vars).

See also
Apop_c, Apop_cv, Apop_col_tv, Apop_col_t, Apop_matrix_col
#define Apop_cv (   data_to_view,
  col 
)

A macro to generate a temporary one-column view of the matrix in the apop_data set data_to_view, pulling out only column col. The view is a gsl_vector.

As usual, column -1 is the vector element of the apop_data set.

1 gsl_vector *v = Apop_cv(your_data, i);
2 
3 for (int i=0; i< your_data->matrix->size2; i++)
4  printf("Σ_%i = %g\n", i, apop_vector_sum(Apop_cv(your_data, i)));

The view is automatically allocated, and disappears as soon as the program leaves the scope in which it is declared.

See also
Apop_cs, Apop_c, Apop_col_tv, Apop_col_t, Apop_matrix_col
#define apop_data_add_names (   dataset,
  type,
  ... 
)    apop_data_add_names_base((dataset), (type), (char const*[]) {__VA_ARGS__, NULL})

Add a list of names to a data set.

  • Use this with a list of names that you type in yourself, like
    1 apop_data_add_names(mydata, 'c', "age", "sex", "height");
    Notice the lack of curly braces around the list.
  • You may have an array of names, probably autogenerated, that you would like to add. In this case, make certain that the last element of the array is NULL, and call the base function:
    1 char const *colnames[] = {"age", "sex", "height", NULL};
    2 apop_data_add_names_base(mydata, 'c', colnames);
    If you forget the NULL marker, this has good odds of segfaulting. You may prefer to use a for loop that inserts each name in turn using apop_name_add.
See also
apop_name_add, although apop_data_add_names will be more useful in most cases.
#define apop_data_falloc (   sizes,
  ... 
)    apop_data_fill(apop_data_alloc sizes, __VA_ARGS__)

Allocate a data set and fill it with values. Put the data set dimensions (one, two, or three dimensions as per apop_data_alloc) in parens, then the data (as per apop_data_fill). E.g.:

1 apop_data *identity2 = apop_data_falloc((2,2),
2  1, 0,
3  0, 1);
4 
5 apop_data *count_vector = apop_data_falloc((5), 0, 1, 2, 3, 4);

If you forget the parens, you will get an obscure error during compilation.

  • This is a pretty simple macro wrapping apop_data_fill and apop_data_alloc, because they appear together so often. The second example expands to:
    1 apop_data *count_vector = apop_data_fill(apop_data_alloc(5), 0, 1, 2, 3, 4);
#define apop_data_fill (   adfin,
  ... 
)    apop_data_fill_base((adfin), (double []) {__VA_ARGS__})

Fill a pre-allocated data set with values.

For example:

1 #include <apop.h>
2 
3 int main(){
4  apop_data *a =apop_data_alloc(2,2,2);
5  double eight = 8.0;
6  apop_data_fill(a, 8, 2.2, eight/2,
7  0, 6.0, eight);
8  apop_data_show(a);
9 }

Warning: I need as many arguments as the size of the data set, and can't count them for you. Too many will be ignored; too few will produce unpredictable results, which may include padding your matrix with garbage or a simple segfault.

Underlying this function is a base function that takes a single list, as opposed to a set of unassociated numbers as above:

1 #include <apop.h>
2 
3 int main(){
4  apop_data *a =apop_data_alloc(2,2,2);
5  double eight = 8.0;
6  double list[] = {8, 2.2, eight/2,
7  0, 6.0, eight};
8  apop_data_fill_base(a, list);
9  apop_data_show(a);
10 }
Parameters
adfin: An apop_data set (that you have already allocated).
...: A series of at least as many floating-point values as there are blanks in the data set.
Returns
A pointer to the same data set that was input.
  • I assume that vector->size==matrix->size1; otherwise I just use matrix->size1.
  • See also apop_data_falloc to allocate and fill on one line. E.g., to generate a unit vector for three dimensions:
    1 apop_data *unit_vector = apop_data_falloc((3), 1, 1, 1);
See also
apop_text_fill, apop_data_falloc, apop_data_unpack
#define apop_data_free (   freeme)    (apop_data_free_base(freeme) ? 0 : ((freeme)= NULL))

Free an apop_data structure.

As with free(), it is safe to send in a NULL pointer (in which case the function does nothing).

If the more pointer is not NULL, I will free the pointed-to data set first. If you don't want to free data sets down the chain, set more=NULL before calling this.

  • This is actually a macro (that calls apop_data_free_base to do the real work). It sets freeme to NULL when it's done, because there's nothing safe you can do with the freed location, and you can later safely test conditions like if (data) ....
#define apop_data_prune_columns (   in,
  ... 
)

Keep only the columns of a data set that you name.

Parameters
in: The data set to prune.
...: A list of names to retain (i.e. the columns that shouldn't be pruned out). For example, if you have run apop_data_summarize, you have columns for several statistics, but may care about only one or two; see the example.

For example:

#include <apop.h>
/* This sample produces a dummy times table, gets a summary, and prunes the summary table.
If you are not a test script, uncomment the last line to display the pruned table. */
int main(){
int i, j;
apop_data *d = apop_data_alloc(0, 10, 4);
for (i=0; i< 10; i++)
for (j=0; j< 4; j++)
apop_data_set(d, i, j, i*j);
apop_data *summary = apop_data_summarize(d);
apop_data_prune_columns(summary, "mean", "median");
assert(apop_name_find(summary->names, "mean", 'c')!=-2);
assert(apop_name_find(summary->names, "median", 'c')!=-2);
assert(apop_name_find(summary->names, "max", 'c')==-2); //not found
assert(apop_name_find(summary->names, "variance", 'c')==-2); //not found
assert(apop_data_get(summary, .row=0, .colname="mean")==0);
assert(apop_data_get(summary, .row=1, .colname="median")==4);
assert(apop_data_get(summary, .row=2, .colname="median")==8);
//apop_data_show(summary);
}
  • I use a case-insensitive search to find your column.
  • If your name matches multiple columns, I'll only give you the first.
  • If I can't find a column matching one of your strings, I throw an error to the screen and continue.
  • This is a macro calling apop_data_prune_columns_base. It packages your list of columns into a list of strings, adds a NULL string at the end, and calls that function.
#define apop_gaussian

Alias for the apop_normal distribution, qv.

#define Apop_matrix_col (   m,
  col,
 
)
Value:
gsl_vector apop_vv_##v = gsl_matrix_column((m), (col)).vector;\
gsl_vector * v = &( apop_vv_##v );

View a single column of a gsl_matrix as a gsl_vector. This is a convenience macro wrapping gsl_matrix_column.

Parameters
mThe gsl_matrix
colThe number of the desired column.
vThe name of the vector view that will be created.

An example:

1 gsl_matrix *m = [fill matrix here];
2 Apop_matrix_col(m, 2, coltwo);
3 Apop_matrix_col(m, 3, colthree);
4 printf("The correlation coefficient between columns two "
5  "and three is %g.\n", apop_vector_correlation(coltwo, colthree));
See also
Apop_cs, Apop_c, Apop_cv, Apop_col_tv, Apop_col_t
#define Apop_matrix_row (   m,
  row,
 
)
Value:
gsl_vector apop_vv_##v = gsl_matrix_row((m), (row)).vector;\
gsl_vector * v = &( apop_vv_##v );

View a single row of a gsl_matrix as a gsl_vector. This is a convenience macro wrapping gsl_matrix_row.

Parameters
mThe gsl_matrix
rowThe number of the desired row.
vThe name of the vector view that will be created.

An example:

1 gsl_matrix *m = [fill matrix here];
2 Apop_matrix_row(m, 2, rowtwo);
3 Apop_matrix_row(m, 3, rowthree);
4 printf("The correlation coefficient between rows two "
5  "and three is %g.\n", apop_vector_correlation(rowtwo, rowthree));
See also
Apop_rs, Apop_r, Apop_row_v, Apop_row_tv, Apop_row_t
#define Apop_maybe_abort (   level)
Value:
{if ((level == -5 && apop_opts.stop_on_warning!='n') \
|| (apop_opts.verbose >= level && apop_opts.stop_on_warning == 'v') \
|| (apop_opts.stop_on_warning=='w') ) \
raise(SIGTRAP);}
apop_opts_type apop_opts
Definition: apop_db.c:10
int verbose
Definition: apop.h:139
char stop_on_warning
Definition: apop.h:140
#define apop_model_coordinate_transform (   ...)    Apop_model_copy_set(apop_coordinate_transform, apop_ct, __VA_ARGS__)

Build an apop_coordinate_transform model, qv.

Returns
An apop_model that is a copy of apop_coordinate_transform and is appropriately set up.
  • Uses the apop_ct_settings group. This macro takes elements of that struct as inputs.
#define apop_model_dcompose (   ...)    Apop_model_copy_set(apop_composition, apop_composition, __VA_ARGS__)

Data composition is using either random draws or parameter estimates from the output of one model as the input data for another model.

Returns
An apop_model that is a copy of the apop_composition model.
#define apop_model_dconstrain (   ...)    Apop_model_copy_set(apop_dconstrain, apop_dconstrain, __VA_ARGS__)

Build an apop_dconstrain model, q.v., which applies a data constraint to the data set. For example, this is how one would truncate a model to have data above zero.

Returns
An apop_model that is a copy of apop_dconstrain and is appropriately set up.
#define apop_model_mixture (   ...)    apop_model_mixture_base((apop_model *[]){__VA_ARGS__, NULL})

Produce a model as a linear combination of other models. See the documentation for the apop_mixture model.

Parameters
...A list of models, either all parameterized or all unparameterized. See examples in the apop_mixture documentation.
#define apop_model_print_hash (   m1)
Value:
((m1)->log_likelihood ? (size_t)(m1)->log_likelihood : \
(m1)->p ? (size_t)(m1)->p*33 : \
(m1)->estimate ? (size_t)(m1)->estimate*33*33 : \
(m1)->draw ? (size_t)(m1)->draw*33*27 : \
(m1)->cdf ? (size_t)(m1)->cdf*27*27 \
: 27)
#define apop_model_stack (   ...)    apop_model_stack_base((apop_model *[]){__VA_ARGS__, NULL})

Generate a model consisting of several models bound together. The output apop_model is a copy of apop_stack; see that model's documentation for details.

Sample use:

1 apop_model *n1 = apop_model_set_parameters(apop_normal, 0, 1);
2 apop_model *n2 = apop_model_copy(n1);
3 apop_model *n3 = apop_model_copy(n1);
4 apop_model *two_independent_normals = apop_model_stack(n1, n2);
5 apop_model *three_independent_normals = apop_model_stack(n1, n2, n3);
6 
7 //But you don't have to parameterize ahead of time. E.g.
8 apop_model *two_n = apop_model_stack(
9  apop_model_copy(apop_normal),
10  apop_model_copy(apop_normal)
11  );
12 apop_model *estimated_norms = apop_estimate(indata, two_n);
  • If you input only one model, return a copy of that model; print a warning iff apop_opts.verbose >= 1.
    Exceptions
    error=='n'First model input is NULL.
#define apop_mrow (   m,
  row 
)
Value:
( \
((m) == NULL || (m)->size1 <= (row) || (row) < 0) ? NULL \
: &(gsl_matrix){.size1=1, .size2=(m)->size2, \
.tda=(m)->tda, .data=gsl_matrix_ptr((m), (row), 0)} \
)
#define Apop_notify (   verbosity,
  ... 
)
Value:
{\
if (apop_opts.verbose != -1 && apop_opts.verbose >= verbosity) { \
if (!apop_opts.log_file) apop_opts.log_file = stderr; \
fprintf(apop_opts.log_file, "%s: ", __func__); fprintf(apop_opts.log_file, __VA_ARGS__); fprintf(apop_opts.log_file, "\n"); \
fflush(apop_opts.log_file); \
} }
apop_opts_type apop_opts
Definition: apop_db.c:10
FILE * log_file
Definition: apop.h:149
int verbose
Definition: apop.h:139

Notify the user of errors, warnings, or debug info.

Parameters
verbosityAt what verbosity level should the user be warned? E.g., if verbosity==2, then print iff apop_opts.verbose >= 2.
...The message to write to STDERR (presuming the verbosity level is high enough). This can be a printf-style format with following arguments. You can produce much more informative error messages this way, e.g., apop_notify(0, "Beta is %g but should be greater than zero.", beta);.
#define Apop_r (   d,
  rownum 
)

A macro to generate a temporary one-row view of apop_data set d, pulling out only row row. The view is also an apop_data set, with names and other decorations.

1 apop_data *v = Apop_r(your_data, i);
2 
3 for (int i=0; i< your_data->matrix->size1; i++)
4  apop_data_print(Apop_r(your_data, i));

The view is automatically allocated, and disappears as soon as the program leaves the scope in which it is declared.

See also
Apop_rs, Apop_row_v, Apop_row_tv, Apop_row_t, Apop_matrix_row
#define apop_rng_get_thread (   thread_in)

The gsl_rng is not itself thread-safe, in the sense that it can not be used simultaneously by multiple threads. However, if each thread has its own gsl_rng, then each will safely operate independently.

Thus, Apophenia keeps an internal store of RNGs for use by threaded functions. If the input to this function, thread_in, is greater than any previous input, then the array of gsl_rngs is extended to length thread_in, and each new element is seeded using ++apop_opts.rng_seed (i.e., the seed is incremented before use).

Parameters
thread_inThe number of the RNG to retrieve, starting at zero (which is how OpenMP numbers its threads). If blank, I'll look up the current thread (via omp_get_thread_num) for you.
Returns
The appropriate RNG, initialized if necessary.
#define Apop_row (   d,
  row,
  outd 
)

Deprecated. Use Apop_r.

#define Apop_row_t (   d,
  rowname,
  outd 
)

After this call, v will hold a view of an apop_data set consisting only of the rowth row of the apop_data set m. Unlike Apop_r, the second argument is a row name, that I'll look up using apop_name_find.

See also
Apop_rs, Apop_r, Apop_rv, Apop_row_tv, Apop_matrix_row
#define Apop_row_tv (   m,
  row,
 
)

After this call, v will hold a vector view of the rowth row of the apop_data set m. Unlike Apop_rv, the second argument is a row name, that I'll look up using apop_name_find.

See also
Apop_rs, Apop_r, Apop_rv, Apop_row_t, Apop_matrix_row
#define Apop_rows (   d,
  rownum,
  len,
  outd 
)

Deprecated. Use Apop_rs.

#define Apop_rs (   d,
  rownum,
  len 
)

A macro to generate a temporary view of apop_data set d, beginning at row row and having length len. The view expires as soon as the program leaves the current scope (like with the usual automatically declared vars).

See also
Apop_r, Apop_rv, Apop_row_tv, Apop_row_t, Apop_matrix_row
#define Apop_rv (   data_to_view,
  row 
)

A macro to generate a temporary one-row view of the matrix in an apop_data set d, pulling out only row row. The view is a gsl_vector set.

1 gsl_vector *v = Apop_rv(your_data, i);
2 
3 for (int i=0; i< your_data->matrix->size1; i++)
4  printf("Σ_%i = %g\n", i, apop_vector_sum(Apop_rv(your_data, i)));

The view is automatically allocated, and disappears as soon as the program leaves the scope in which it is declared.

See also
Apop_rows, Apop_row_v, Apop_row_tv, Apop_row_t, Apop_matrix_row
#define Apop_settings_copy (   name,
  ... 
)
Value:
void * name##_settings_copy(name##_settings *in) {\
name##_settings *out = malloc(sizeof(name##_settings)); \
*out = *in; \
__VA_ARGS__; \
return out; \
}

A convenience macro for declaring the copy function for a new settings group. See the documentation outline -> models -> model settings -> writing new settings group for details.

To just do a direct copy, the default works; let your settings group be named ysg:

1 Apop_settings_copy (ysg, )

generates a function that allocates space for a new settings group and copies all elements from the input group to the output group.

The space after the comma indicates that there is no new procedural code. If you want to add some, feel free. E.g.,

1 Apop_settings_copy (ysg,
2  if (!in->score)
3  out->score = 1;
4  out->data_owner = 0;
5 )

The names in and out are built into the macro.

#define Apop_settings_declarations (   ysg)
Value:
ysg##_settings * ysg##_settings_init(ysg##_settings); \
void * ysg##_settings_copy(ysg##_settings *); \
void ysg##_settings_free(ysg##_settings *);
#define Apop_settings_free (   name,
  ... 
)
Value:
void name##_settings_free(name##_settings *in) {\
__VA_ARGS__; \
free(in); \
}

A convenience macro for declaring the delete function for a new settings group. See the documentation outline -> models -> model settings -> writing new settings group for details.

If you don't have internal structure elements to free, let your settings group be named ysg:

1 Apop_settings_free (ysg, )

generates a function that simply frees the input settings group.

If your structure is pointing to other structures that need to be freed first, then add them after that comma:

1 Apop_settings_free (ysg,
2  apop_data_free(in->dataset);
3 )

The name in is built into the macro.

#define Apop_settings_init (   name,
  ... 
)
Value:
name##_settings *name##_settings_init(name##_settings in) { \
name##_settings *out = malloc(sizeof(name##_settings)); \
*out = in; \
__VA_ARGS__; \
return out; \
}

A convenience macro for declaring the initialization function for a new settings group. See the documentation outline -> models -> model settings -> writing new settings group for details.

This sets the defaults for every element in the structure, so you will want a line for every element of your structure (except the ones that default to NULL, which have already been set as such).

1 Apop_settings_init (ysg,
2  Apop_varad_set(size1, 99);
3  Apop_varad_set(size2, 2.3);
4  Apop_varad_set(dataset, apop_data_alloc(out->size1, out->size2));
5  )

If you need them, the input is a structure named in, and the output a pointer-to-struct named out.

#define Apop_stopif (   test,
  onfail,
  level,
  ... 
)
Value:
do {\
if (test) { \
Apop_notify(level, __VA_ARGS__); \
Apop_maybe_abort(level) \
onfail; \
} } while(0)
#define Apop_notify(verbosity,...)
Definition: apop.h:980

Execute an action and print a message to stderr (or the current FILE handle held by apop_opts.log_file). Intended for leaving a function on failure.

Parameters
testThe expression that, if true, triggers all the action.
onfailIf the assertion fails, do this. E.g., out->error='x'; return GSL_NAN. Notice that it is OK to include several lines of semicolon-separated code here, but if you have a lot to do, the most readable option may be goto outro, plus an appropriately-labeled section at the end of your function.
levelPrint the warning message only if apop_opts.verbose is greater than or equal to this. Zero usually works, but for minor infractions use one.
...The error message in printf form, plus any arguments to be inserted into the printf string. I'll provide the function name and a carriage return.
  • If apop_opts.stop_on_warning is nonzero and not 'v', then a failed test halts via abort(), even if the apop_opts.verbose level is set so that the warning message doesn't print to screen. Use this when running via debugger.
  • If apop_opts.stop_on_warning is 'v', then a failed test halts via abort() iff the verbosity level is high enough to print the error.
#define Apop_subm (   data_to_view,
  srow,
  scol,
  nrows,
  ncols 
)

Generate a subview of a submatrix within a gsl_matrix. Like Apop_r, et al., the view is an automatically-allocated variable that is lost once the program flow leaves the scope in which it is declared.

Parameters
data_to_viewThe root matrix
srowthe first row (in the root matrix) of the top of the submatrix
scolthe first column (in the root matrix) of the left edge of the submatrix
nrowsnumber of rows in the submatrix
ncolsnumber of columns in the submatrix
#define APOP_SUBMATRIX (   m,
  srow,
  scol,
  nrows,
  ncols,
 
)

Deprecated. Use Apop_subm.

#define apop_subvector (   v,
  start,
  len 
)
Value:
( \
((v) == NULL || (v)->size < ((start)+(len)) || (start) < 0) ? NULL \
: &(gsl_vector){.size=(len), .stride=(v)->stride, .data=(v)->data+(start*(v)->stride)})
#define apop_text_fill (   dataset,
  ... 
)    apop_text_fill_base((dataset), (char* []) {__VA_ARGS__, NULL})

Fill the text part of an already-allocated apop_data set with a list of strings.

Parameters
datasetA data set that you already prepared with apop_text_alloc.
...A list of strings. The first row is filled first, then the second, and so on to the end of the text grid.
  • No NULL strings. A blank string, "" is OK.
  • If you provide more or fewer strings than are needed to fill the text grid and apop_opts.verbose >=1, I print a warning and continue to the end of the text grid or data set, whichever is shorter.
  • If the data set is NULL, I return NULL. If you provide a NULL data set but a non-NULL list of text elements, and apop_opts.verbose >=1, I print a warning and return NULL.
  • Remember that the C preprocessor concatenates two adjacent strings into one. Here is an attempt to fill a $ 2\times 3$ grid:
    1 apop_data *one23 = apop_text_fill(apop_text_alloc(NULL, 2, 3),
    2  "one", "two", "three" //missing comma!
    3  "two", "four", "six");
    The preprocessor will join "three" "two" to form "threetwo", leaving you with only five strings.
  • If you have a NULL-delimited array of strings (not just a loose list as above), then use apop_text_fill_base.
#define apop_update_hash (   m1,
  m2 
)
Value:
( \
((m1)->log_likelihood ? (size_t)(m1)->log_likelihood : \
(m1)->p ? (size_t)(m1)->p*33 : \
(m1)->draw ? (size_t)(m1)->draw*33*27 \
: 33*27*19) \
+((m2)->log_likelihood ? (size_t)(m2)->log_likelihood : \
(m2)->p ? (size_t)(m2)->p*33 : \
(m2)->draw ? (size_t)(m2)->draw*33*27 \
: 33*27*19 \
) * 37)
#define apop_varad_declare (   type,
  name,
  ... 
)
Value:
typedef struct { \
__VA_ARGS__ ; \
} variadic_type_##name; \
apop_varad_head(type, name);
#define apop_vector_fill (   avfin,
  ... 
)    apop_vector_fill_base((avfin), (double []) {__VA_ARGS__})

Fill a pre-allocated gsl_vector with values.

See apop_data_alloc for a relevant example. See also apop_matrix_alloc.

Warning: I need as many arguments as the size of the vector, and can't count them for you. Too many will be ignored; too few will produce unpredictable results, which may include padding your vector with garbage or a simple segfault.

Parameters
avfinA gsl_vector (that you have already allocated).
...A series of exactly as many values as there are spaces in the vector.
Returns
A pointer to the same vector that was input.

Typedef Documentation

typedef struct apop_data apop_data

The apop_data structure represents a data set. It primarily joins together a gsl_vector, a gsl_matrix, and a table of strings, then gives them all row and column names. It tries to be minimally intrusive, so you can use it everywhere you would use a gsl_matrix or a gsl_vector.

If you are viewing the HTML documentation, here is a diagram showing a sample data set with all of the elements in place. Together, they represent a data set where each row is an observation, which includes both numeric and text values, and where each row/column is named.

RownameVector Matrix TextWeights
"Steven"
"Sandra"
"Joe"
Outcome
1
0
1
Age Weight (kg) Height (cm)
32 65 175
41 61 165
40 73 181
Sex State
Male Alaska
Female Alabama
Male Alabama
1
3.2
2.4

Allocate using apop_data_alloc, free via apop_data_free, or more generally, see the apop_data_... section of the index (in the header links) for the many other functions that operate on this struct.

See also the Data Sets section of the outline page (also in the header links) for further notes on getting and manipulating the elements of an apop_data set.

A proposal distribution for apop_mcmc_settings and its accompanying functions and information. By default, these will be apop_multivariate_normal models. The step_fn and adapt_fn have to be written around the model and your preferences. For the defaults, the step function recenters the mean of the distribution around the last accepted proposal, and the adapt function widens the Σ for the Normal if the accept rate is too low; narrows it if the accept rate is too large.

You may provide an array of proposals. The length of the list of proposals must match the number of chunks, as per the gibbs_chunks setting in the apop_mcmc_settings group that the array of proposals is a part of. Each proposal must be initialized to include all elements, and the step and adapt functions probably have to be written anew for each type of model.

This segment of the interface is in beta. A future revision may make it easier to design new proposals.

typedef struct apop_model apop_model

A statistical model.

Function Documentation

apop_data* apop_anova ( char *  table,
char *  data,
char *  grouping1,
char *  grouping2 
)

This function produces a traditional one- or two-way ANOVA table. It works from data in an SQL table, using queries of the form select data from table group by grouping1, grouping2.

Parameters
tableThe table to be queried. Anything that can go in an SQL from clause is OK, so this can be a plain table name or a temp table specification like (select ... ), with parens.
dataThe name of the column holding the count or other such data
grouping1The name of the first column by which to group data
grouping2If this is NULL, then the function will return a one-way ANOVA. Otherwise, the name of the second column by which to group data in a two-way ANOVA.
int apop_arms_draw ( double *  out,
gsl_rng *  r,
apop_model m 
)

Adaptive rejection metropolis sampling.

This is a function to make random draws from any univariate distribution (more or less).

The author, Wally Gilks, explains on http://www.amsta.leeds.ac.uk/~wally.gilks/adaptive.rejection/web_page/Welcome.html, that ``ARS works by constructing an envelope function of the log of the target density, which is then used in rejection sampling (see, for example, Ripley, 1987). Whenever a point is rejected by ARS, the envelope is updated to correspond more closely to the true log density, thereby reducing the chance of rejecting subsequent points. Fewer ARS rejection steps implies fewer point-evaluations of the log density.''

  • It accepts only functions with univariate inputs. I.e., it will put a single value in the vector part of a apop_data set, and then evaluate the log likelihood at that point.
  • It is currently the default for the apop_draw function, so you can just call that if you prefer.
  • There are a great number of parameters, in the apop_arms_settings structure. The structure also holds a history of the points tested to date. That means that the system will be more accurate as more draws are made. It also means that if the parameters change, or you use apop_model_copy, you should call Apop_settings_rm_group(your_model, apop_arms) to clear the model of points that are not valid for a different situation.
  • See apop_arms_settings for the list of parameters that you may want to set, via a form like apop_model_add_group(your_model, apop_arms, .model=your_model, .xl=8, .xr =14);. The model element is mandatory; you'll get a run-time complaint if you forget it.
apop_model* apop_beta_from_mean_var ( double  m,
double  v 
)

The Beta distribution is useful for modeling because it is bounded between zero and one, and can be either unimodal (if the variance is low) or bimodal (if the variance is high), and can have either a slant toward the bottom or top of the range (depending on the mean).

The distribution has two parameters, typically named $\alpha$ and $\beta$, which can be difficult to interpret. However, there is a one-to-one mapping between (alpha, beta) pairs and (mean, variance) pairs. Since we have good intuition about the meaning of means and variances, this function takes in a mean and variance, calculates alpha and beta behind the scenes, and returns the appropriate Beta distribution as a model.

Parameters
mThe mean the Beta distribution should have. Notice that m is in [0,1].
vThe variance which the Beta distribution should have. It is in (0, 1/12), where (1/12) is the variance of a Uniform(0,1) distribution. Funny things happen with variance near 1/12 and mean far from 1/2.
Returns
Returns an apop_beta model with its parameters appropriately set.
Exceptions
out->error=='r'Range error: mean is not within [0, 1].
apop_data* apop_bootstrap_cov ( apop_data data,
apop_model model,
gsl_rng *  rng,
int  iterations,
char  keep_boots,
char  ignore_nans 
)

Give me a data set and a model, and I'll give you the bootstrapped covariance matrix of the parameter estimates.

Parameters
dataThe data set. An apop_data set where each row is a single data point. (No default)
modelAn apop_model, whose estimate method will be used here. (No default)
iterationsHow many bootstrap draws should I make? (default: 1,000)
rngAn RNG that you have initialized, probably with apop_rng_alloc. (Default: an RNG from apop_rng_get_thread)
keep_bootsIf 'y', then add a page to the output apop_data set with the statistics calculated for each bootstrap iteration. They are packed via apop_data_pack, so use apop_data_unpack if needed. (Default: 'n')
1 apop_data *boot_output = apop_bootstrap_cov(your_data, your_model, .keep_boots='y');
2 apop_data *boot_stats = apop_data_get_page(boot_output, "<bootstrapped statistics>");
3 
4 Apop_matrix_row(boot_stats->matrix, 27, row_27)
5 //If the output statistic is not just a vector, you'll need to use apop_data_unpack to put
6 //it into the right shape. Let's assume for now that it's just a vector:
7 printf("The statistics calculated on the 28th iteration:\n");
8 apop_vector_print(row_27);
ignore_nansIf 'y' and any of the elements in the estimation return NaN, then I will throw out that draw and try again. If 'n', then I will write that set of statistics to the list, NaN and all. I keep count of throw-aways; if there are more than iterations elements thrown out, then I throw an error and return with estimates using data I have so far. That is, I assume that NaNs are rare edge cases; if they are as common as good data, you might want to rethink how you are using the bootstrap mechanism. (Default: 'n')
Returns
An apop_data set whose matrix element is the estimated covariance matrix of the parameters.
Exceptions
out->error=='n'NULL input data.
out->error=='N'too many Nans.
See also
apop_jackknife_cov
void apop_crosstab_to_db ( apop_data in,
char *  tabname,
char *  row_col_name,
char *  col_col_name,
char *  data_col_name 
)

See apop_db_to_crosstab for the storyline; this is the complement, which takes a crosstab and writes its values to the database.

For example, I would take

c0c1
r023
r104

and do the following writes to the database:

1 insert into your_table values ('r0', 'c0', 2);
2 insert into your_table values ('r0', 'c1', 3);
3 insert into your_table values ('r1', 'c0', 0);
4 insert into your_table values ('r1', 'c1', 4);
  • If your data set does not have names (or not enough names), I will use the scheme above, filling in names of the form r0, r1, ... c0, c1, .... Text columns get their own numbering system, t0, t1, ..., which is a little more robust than continuing the column count from the matrix.
  • I handle only the matrix and text.
void apop_data_add_named_elmt ( apop_data d,
char *  name,
double  val 
)

A convenience function to add a named element to a data set. Many of Apophenia's testing procedures use this to easily produce a column of named parameters. It is public as a convenience.

Parameters
dThe apop_data structure. Must not be NULL, but may be blank (as per allocation via apop_data_alloc ( ) ).
nameThe name to add
valthe value to add to the set.
  • I use the position of the last non-empty row name to know where to put the value. If there are two names in the data set, then I will put the new name in the third name slot and the data in the third slot in the vector. If you use this function from start to finish in building your list, then you'll be fine.
  • If the vector is too short (or NULL), I will call apop_vector_realloc internally to make space.
  • This fits well with the defaults for apop_data_get. An example:
1 apop_data *list = apop_data_alloc();
2 apop_data_add_named_elmt(list, "height", 165);
3 apop_data_add_named_elmt(list, "weight", 60);
4 
5 double height = apop_data_get(list, .rowname="height");
apop_data* apop_data_add_page ( apop_data dataset,
apop_data newpage,
const char *  title 
)

Add a page to a apop_data set. It gets a name so you can find it later.

Parameters
datasetThe input data set, to which a page will be added.
newpageThe page to append
titleThe name of the new page. Remember, this is truncated at 100 characters.
Returns
The new page. I post a warning if I am appending a NULL page or appending to a NULL data set and apop_opts.verbose >=1 .
  • Some data is fundamentally multi-page; an optimization search over multi-page parameters would search the space given by all pages, for example. Also, pages may be appended as output or auxiliary information, such as covariances—an MLE would not search over these elements. Generally, any page with a name in XML-ish brackets, such as <Covariance>, will be considered informational and ignored by search routines, missing data routines, et cetera. This is achieved by a rule in apop_data_pack and apop_data_unpack.

Here is a toy example that establishes a baseline data set, adds a page, modifies it, and then later retrieves it.

1 apop_data *d = apop_data_alloc(10, 10, 10); //the base data set.
2 apop_data *a_new_page = apop_data_add_page(d, apop_data_alloc(2,2), "new 2 x 2 page");
3 gsl_matrix_set_all(a_new_page->matrix, 3);
4 
5 //later:
6 apop_data *retrieved = apop_data_get_page(d, "new", 'r'); //use regexes, not literal match.
7 apop_data_show(retrieved); //print a 2x2 grid of 3s.
apop_data* apop_data_alloc ( const size_t  size1,
const size_t  size2,
const int  size3 
)

Allocate a apop_data structure, to be filled with data.

  • The typical case is three arguments, like apop_data_alloc(2,3,4): vector size, matrix rows, matrix cols. If the first argument is zero, you get a NULL vector.
  • Two arguments, apop_data_alloc(2,3), would allocate just a matrix, leaving the vector NULL.
  • One argument, apop_data_alloc(2), would allocate just a vector, leaving the matrix NULL.
  • Zero arguments, apop_data_alloc(), will produce a basically blank set, with out->matrix==out->vector==NULL.

For allocating the text part, see apop_text_alloc.

The weights vector is set to NULL. If you need it, allocate it via

1 d->weights = gsl_vector_alloc(row_ct);
See also
apop_data_calloc
Returns
The apop_data structure, allocated and ready.
Exceptions
out->error=='a'Allocation error. The matrix, vector, or names couldn't be malloced, which probably means that you requested a very large data set.
  • An apop_data struct, by itself, is about 72 bytes. If I can't allocate that much memory, I return NULL. But if even this much fails, your computer may be on fire and you should go put it out.
apop_data* apop_data_calloc ( const size_t  size1,
const size_t  size2,
const int  size3 
)

Allocate a apop_data structure, to be filled with data; set everything in the allocated portion to zero. See apop_data_alloc for details.

Returns
The apop_data structure, allocated and zeroed out.
Exceptions
out->error=='m'malloc error; probably out of memory.
See also
apop_data_alloc
apop_data* apop_data_copy ( const apop_data in)

Copy one apop_data structure to another. That is, all data is duplicated.

Basically a front-end for apop_data_memcpy for those who prefer this sort of syntax.

Unlike apop_data_memcpy, I do follow the more pointer.

Parameters
inthe input data
Returns
a structure that this function will allocate and fill. If input is NULL, then this will be NULL.
Exceptions
out.error='a'Allocation error.
out.error='c'Cyclic link: D->more == D (may be later in the chain, e.g., D->more->more = D->more) You'll have only a partial copy.
out.error='d'Dimension error; should never happen.
out.error='p'Missing part error; should never happen.
  • If the input data set has an error, then I will copy it anyway, including the error flag (which might be overwritten). I print a warning if the verbosity level is >=1.
apop_data* apop_data_correlation ( const apop_data in)

Returns the matrix of correlation coefficients $(\sigma^2_{xy}/(\sigma_x\sigma_y))$ relating each column with each other.

Parameters
inA data matrix: rows are observations, columns are variables. If you give me a weights vector, I'll use it.
Returns
Returns the correlation matrix relating each column with each other. This function allocates the matrix for you.
Exceptions
out->error='a'Allocation error.
apop_data* apop_data_covariance ( const apop_data in)

Returns the sample variance/covariance matrix relating each column of the matrix to each other column.

Parameters
inAn apop_data set. If the weights vector is set, I'll take it into account.
  • This is the sample covariance—dividing by $n-1$, not $n$.
Returns
Returns an apop_data set holding the variance/covariance matrix relating each column with each other.
Exceptions
out->error='a'Allocation error.
char apop_data_free_base ( apop_data freeme)

Free the elements of the given apop_data set and then the apop_data set itself. Intended to be used by apop_data_free, a macro that calls this to free elements, then sets the value to NULL.

  • apop_data_free is a macro that calls this function and, on success, sets the input pointer to NULL. For typical cases, that's slightly more useful than this function.
Exceptions
freeme.error='c'Circular linking is against the rules. If freeme->more == freeme, then I set freeme.error='c' and return. If you send in a structure like A -> B -> B, then both data sets A and B will be marked.
Returns
0 on OK, 'c' on error.
apop_data* apop_data_get_factor_names ( apop_data data,
int  col,
char  type 
)

Factor names are stored in an auxiliary table with a name like "<categories for your_var>". Producing this name is annoying (and prevents us from eventually making it human-language independent), so use this function to get the list of factor names.

Parameters
dataThe data set. (No default, must not be NULL)
colThe column in the main data set whose name I'll use to check for the factor name list. Vector==-1. (default=0)
typeIf you are referring to a text column, use 't'. (default='d')
Returns
A pointer to the page in the data set with the given factor names.
apop_data* apop_data_get_page ( const apop_data data,
const char *  title,
const char  match 
)

It's good form to get a page from your data set by name, because you may not know the order for the pages, and the stepping through makes for dull code anyway (apop_data *page = dataset; while (page->more) page= page->more;).

Parameters
dataThe apop_data set to use. No default; if NULL, gives a warning if apop_opts.verbose >=1 and returns NULL.
titleThe name of the page to retrieve. Default="Info", which is the name of the page of additional estimation information returned by estimation routines (log likelihood, status, AIC, BIC, confidence intervals, ...).
matchIf 'c', case-insensitive match (via strcasecmp); if 'e', exact match, if 'r' regular expression substring search (via apop_regex). Default='c'.
Returns
The page whose title matches what you gave me. If I don't find a match, return NULL.
apop_data* apop_data_listwise_delete ( apop_data d,
char  inplace 
)

If there is an NaN anywhere in the row of data (including the matrix, the vector, the weights, and the text) then delete the row from the data set.

  • If every row has an NaN, then this returns NULL.
  • If apop_opts.nan_string is not NULL, then I will make case-insensitive comparisons to the text elements to check for bad data as well.
  • If inplace = 'y', then I'll free each element of the input data set and refill it with the pruned elements. I'll still take up (up to) twice the size of the data set in memory during the function. If every row has an NaN, then your apop_data set will end up with NULL vector, matrix, .... if inplace = 'n', then the original data set is left unmolested.
  • I only look at the first page of data (i.e. the more element is ignored).
  • This function uses the Designated initializers syntax for inputs.
Parameters
dThe data, with NaNs
inplaceIf 'y', clear out the pointer-to-apop_data that you sent in and refill with the pruned data. If 'n', leave the set alone and return a new data set. Default='n'.
Returns
A (potentially shorter) copy of the data set, without NaNs. If inplace=='y', a pointer to the input, which was shortened in place. If the entire data set is cleared out, then this will be NULL.
void apop_data_memcpy ( apop_data out,
const apop_data in 
)

Copy one apop_data structure to another.

This function does not allocate the output structure or the vector, matrix, text, or weights elements—I assume you have already done this and got the dimensions right. I will assert that there is at least enough room in the destination for your data, and fail if the copy would write more elements than there are bins.

  • If you want space allocated or are unsure about dimensions, use apop_data_copy.
  • If both in and out have a more pointer, also copy subsequent page(s).
  • You can use the subsetting macros, Apop_r or Apop_rs, to copy within a data set:
1 //Copy the contents of row i of mydata to row j.
2 apop_data *fromrow = Apop_r(mydata, i);
3 apop_data *torow = Apop_r(mydata, j);
4 apop_data_memcpy(torow, fromrow);
5 
6 // or just
7 apop_data_memcpy(Apop_r(mydata, j), Apop_r(mydata, i));
Parameters
outA structure that this function will fill. Must be preallocated with the appropriate sizes.
inThe input data.
Exceptions
out.error='d'Dimension error; couldn't copy.
out.error='p'Part missing; e.g., in->matrix exists but out->matrix doesn't; couldn't copy.
apop_data* apop_data_pmf_compress ( apop_data in)

Say that you have added a long list of observations to a single apop_data set, meaning that each row has weight one. There are a huge number of duplicates, perhaps because there are a handful of types that keep repeating:

Vector valueText nameWeights
12Dozen1
1Single1
2Pair1
2Pair1
1Single1
1Single1
2Pair1
2Pair1

You would like to reduce this to a set of distinct values, with their weights adjusted accordingly:

Vector valueText nameWeights
12Dozen1
1Single3
2Pair4
Parameters
inAn apop_data set that may have duplicate rows. As above, the data may be in text and/or numeric formats. If there is a weights vector, I will add those weights together as duplicates are merged. If there is no weights vector, I will create one, which is initially set to one for all values, and then aggregated as above.
Returns
Your input is changed in place, via apop_data_rm_rows, so use apop_data_copy before calling this function if you need to retain the original format. For your convenience, this function returns a pointer to your original data, which has now been pruned.
apop_data* apop_data_prune_columns_base ( apop_data d,
char **  colnames 
)

Keep only the columns of a data set that you name. This is the function called internally by the apop_data_prune_columns macro. In most cases, you'll want to use that macro. An example of the two uses demonstrating the difference:

1 apop_data_prune_columns(d, "mean", "median");
2 
3 char *list[] = {"mean", "median", NULL};
4 apop_data_prune_columns_base(d, list);
Parameters
dThe data set to prune.
colnamesA null-terminated list of names to retain (i.e. the columns that shouldn't be pruned out).
Returns
A pointer to the input data set, now pruned.
apop_data* apop_data_rank_compress ( apop_data in)

One often finds data where the column indicates the value of the data point. There may be two columns, and a mark in the first indicates a miss while a mark in the second is a hit. Or say that we have the following list of observations:

1 2 3 3 2 1 1 2 1 1 2 1 1

Then we could write this as:

1 0 1 2 3
2 ----------
3 0 6 4 2

because there are six 1s observed, four 2s observed, and two 3s observed. We call this rank format, because 1 (or zero) is typically the most common, 2 is second most common, et cetera.

This function takes in a list of observations, and aggregates them into a single row in rank format.

  • You may be interested in apop_data_to_factors to convert real numbers or text into a matrix of categories.
  • The number of bins is simply the largest number found. So if there are bins {0, 1, 2} and your data set happens to consist of 0 0 1 1 0, then I won't know to generate results with three bins where the last bin has probability zero.
/* A round trip: generate Zipf-distributed draws, summarize them to a single list of
rankings, then expand the rankings to a list of single entries. The sorted list at the end
of this should be identical to the (sorted) original list. */
#include <apop.h>
int main(){
gsl_rng *r = apop_rng_alloc(2342);
int i, length = 1e4;
apop_data *draws = apop_data_alloc(length);
for (i=0; i< length; i++)
apop_draw(apop_data_ptr(draws, i, -1), r, a_zipf);
apop_data *by_rankings = apop_data_rank_compress(draws);
//The first row of the matrix is suitable for plotting.
//apop_data_show(by_rankings);
assert(apop_matrix_sum(by_rankings->matrix) == length);
apop_data *re_expanded = apop_data_rank_expand(by_rankings);
gsl_sort_vector(draws->vector);
gsl_sort_vector(re_expanded->vector);
assert(apop_vector_distance(draws->vector, re_expanded->vector) < 1e-5);
}
apop_data* apop_data_rank_expand ( apop_data in)

The complement to this is apop_data_rank_compress; see that function's documentation for the story and an example.

This function takes in a data set where the zeroth column includes the count(s) of times that zero was observed, the first gives the count(s) of times that one was observed, et cetera. It outputs a data set whose vector element includes a list that has exactly the given frequency of zeros, ones, et cetera.

void apop_data_rm_columns ( apop_data d,
int *  drop 
)

Remove the columns set to one in the drop vector. The returned data structure looks like it was modified in place, but the data matrix and the names are duplicated before being pared down, so if your data is taking up more than half of your memory, this may not work.

Parameters
dthe apop_data structure to be pared down.
dropan array of ints. If drop[7]==1, then column seven will be cut from the output. A reminder: calloc(in->size2 , sizeof(int)) will fill your array with zeros on allocation, and memset(drop, 1, in->size2 * sizeof(int)) will quickly fill an array of ints with nonzero values.
apop_data* apop_data_rm_page ( apop_data data,
const char *  title,
const char  free_p 
)

Remove the first page from an apop_data set that matches a given name.

Parameters
dataThe input data set, from which a page will be removed. No default. If NULL, I return silently if apop_opts.verbose < 1; print an error otherwise.
titleThe case-insensitive name of the page to remove. Default: "Info"
free_pIf 'y', then apop_data_free the page. Default: 'y'.
Returns
If not freed, a pointer to the apop_data page that I just pulled out. Thus, you can use this to pull a single page from a data set. I set that page's more pointer to NULL, to minimize any confusion about more-than-linear linked list topologies. If free_p=='y' (the default) or the page is not found, return NULL.
  • I don't check the first page, so there's no concern that the head of your list of pages will move. Again, the intent of the ->more pointer in the apop_data set is not to fully implement a linked list, but primarily to allow you to staple auxiliary information to a main data set.
  • If I don't find the page you want, I return NULL, and print a message if apop_opts.verbose >= 1.
apop_data* apop_data_rm_rows ( apop_data in,
int *  drop,
apop_fn_ir  do_drop,
void *  drop_parameter 
)

Remove the rows set to one in the drop vector or for which the do_drop function returns one.

Parameters
inthe apop_data structure to be pared down
dropa vector with as many elements as the max of the vector, matrix, or text parts of in, with a one marking those rows to be removed.
do_dropA function that returns one for rows to drop and zero for rows to not drop. A sample function:
1 int your_drop_function(apop_data *onerow, void *extra_param){
2  return gsl_isnan(apop_data_get(onerow)) || !strcmp(onerow->text[0][0], "Uninteresting data point");
3 }
apop_data_rm_rows uses Apop_r to get a subview of the input data set of height one (and since all the default arguments default to zero, you don't have to write out things like apop_data_get (onerow, .row=0, .col=0), which can help to keep things readable).
drop_parameterIf your do_drop function requires additional input, put it here and it will be passed through.
Returns
Returns a pointer to the input data set, now pruned.
  • If all the rows are to be removed, then you will wind up with the same apop_data set, with NULL vector, matrix, weight, and text. Therefore, you may wish to check for NULL elements after use. I remove rownames, but leave the other names, in case you want to add new data rows.
  • The typical use is to provide only a list or only a function. If both are NULL, I return without doing anything, and print a warning if apop_opts.verbose >=1. If you provide both, I will drop the row if either the vector has a one in that row's position, or if the function returns a nonzero value.
  • This function uses the Designated initializers syntax for inputs.
int apop_data_set_row ( apop_data d,
apop_data row,
int  row_number 
)

Now that you've used Apop_r to pull a row from an apop_data set, this function lets you write that row to another position in the same data set or a different data set entirely.

The set written to must have the same form as the original:

  • a vector element has to be present if one existed in the original,
  • same for the weights vector,
  • the matrix in the destination has to have as many columns as in the original, and
  • the text has to have a row long enough to hold the original
  • If the row to be written to already has a rowname, it is overwritten. If d->names->rowct == row_number (all rows up to row_number have row names), then extend the list of row names by one to add the new name. Else, don't add the row name.
  • Column names (of all types) aren't touched. Maybe use apop_data_copy or apop_name_copy if you need to copy these names.

If any of the source elements are NULL, I won't bother to check that element in the destination.

Returns
0=OK, -1=error (probably a source/destination size mismatch).
  • The error codes for out-of-bounds errors are thread-safe iff you have a C11-compliant compiler (thanks to the _Thread_local keyword) or a version of GCC with the __thread extension enabled.
apop_data* apop_data_sort ( apop_data data,
apop_data sort_order,
char  asc,
char  inplace,
double *  col_order 
)

Sort an apop_data set on an arbitrary sequence of columns.

The sort_order set is a one-row data set that should look like the data set being sorted. The easiest way to generate it is to use Apop_r to pull one row of the table, then copy and fill it. For each column you want used in the sort, assign a ranking giving whether the column should be sorted first, second, .... Columns you don't want used in the sorting should be set to NAN. Ties are broken by the earlier element in the default order (see below).

E.g., to sort by the last column of a five-column matrix first, then the next-to-last column, then the next-to-next-to-last, then by the first text column, then by the second text column:

1 apop_data *sort_order = apop_data_copy(Apop_r(data, 0));
2 sort_order->vector = NULL; //so it will be skipped.
3 Apop_data_fill(sort_order, NAN, NAN, 3, 2, 1);
4 apop_text_add(sort_order, 0, 0, "4");
5 apop_text_add(sort_order, 0, 1, "5");
6 apop_data_sort(data, sort_order);

I use only comparisons, not the actual numeric values, so you can use any sequence of numbers: (1, 2, 3) and (-1.32, 0, 27) work identically.

  • Strings are sorted case-insensitively, using strcasecmp. [exercise for the reader: modify the source to use Glib's locale-correct string sorting.]
  • The setup generates a lexicographic sort using the columns you specify. If you would like a different sort order, such as Euclidean distance to the origin, you can generate a new column expressing your preferred metric, and then sort on that. See the example below.
Parameters
dataThe data set to be sorted. If NULL, this function is a no-op that returns NULL.
sort_orderA apop_data set describing the order in which columns are used for sorting, as above. If NULL, then sort by the vector, then each matrix column, then text, then weights, then row names.
inplaceIf 'n', make a copy, else sort in place. (default: 'y').
ascIf 'a', ascending; if 'd', descending. This is applied to all columns; column-by-column application is to do. (default: 'a').
col_orderFor internal use only. In your call, it should be NULL; the Designated initializers syntax will take care of it for you.
Returns
A pointer to the sorted data set. If inplace=='y' (the default), then this is the same as the input set.

A few examples:

#ifdef Datadir
#define DATADIR Datadir
#else
#define DATADIR "."
#endif
#include <apop.h>
#include <unistd.h>
#ifdef Testing
#include "sort_tests.c" //For Apophenia's test suite, some tedious checks that the sorts worked
#endif
//get_distance is for the sort-by-Euclidian distance example below.
double get_distance(gsl_vector *v) {return apop_vector_distance(v);}
int main(){
apop_text_to_db( DATADIR "/" "amash_vote_analysis.csv" , .tabname="amash_vote_analysis");
apop_data *d = apop_query_to_mixed_data("mntmtm", "select 1,id,party,contribs/1000.0,vote,ideology from amash_vote_analysis ");
//use the default order of columns for sorting
apop_data *sorted = apop_data_sort(d, .inplace='n');
#ifndef Testing
apop_data_print(sorted);
#else
check_sorting1(sorted);
#endif
//set up a specific column order
perm->vector = NULL;
apop_data_fill(perm, 5, 3, 4);
apop_text_add(perm, 0, 0, "2");
apop_text_add(perm, 0, 1, "1");
apop_data_sort(d, perm);
#ifndef Testing
#else
check_sorting2(d);
#endif
//sort a list of names
apop_data_add_names(blank, 'r', "C", "E", "A");
assert(*blank->names->row[0] == 'A');
assert(*blank->names->row[1] == 'C');
assert(*blank->names->row[2] == 'E');
//take each row of the matrix as a vector; store the Euclidian distance to the origin in the vector;
//sort in descending order.
apop_data *rowvectors = apop_text_to_data( DATADIR "/" "test_data" );
apop_map(rowvectors, .fn_v=get_distance, .part='r', .inplace='y');
apop_data *arow = apop_data_copy(Apop_r(rowvectors, 0));
arow->matrix=NULL; //sort only by the distance vector
apop_data_sort(rowvectors, arow, .asc='d');
#ifndef Testing
apop_data_show(rowvectors);
#else
double prev = INFINITY;
for (int i=0; i< rowvectors->vector->size; i++){
double this = apop_data_get(rowvectors, i, -1);
assert(this < prev);
prev = this;
}
#endif
}
apop_data** apop_data_split ( apop_data in,
int  splitpoint,
char  r_or_c 
)

Split one input apop_data structure into two.

For the opposite operation, see apop_data_stack.

Parameters
inThe apop_data structure to split
splitpointThe index of what will be the first row/column of the second data set. E.g., if this is -1 and r_or_c=='c', then the whole data set will be in the second data set; if this is the length of the matrix then the whole data set will be in the first data set. Another way to put it is that splitpoint will equal the number of rows/columns in the first matrix (unless it is -1, in which case the first matrix will have zero rows, or it is greater than the matrix's size, in which case it will have as many rows as the original).
r_or_cIf this is 'r' or 'R', then put some rows in the first data set and some in the second; if 'c' or 'C', split columns into first and second data sets.
Returns
An array of two apop_data sets. If one is empty then a NULL pointer will be returned in that position. For example, for a data set of 50 rows, apop_data **out = apop_data_split(data, 100, 'r') sets out[0] = apop_data_copy(data) and out[1] = NULL.
  • When splitting at a row, the text is also split.
  • more pointer is ignored.
  • The apop_data->vector is taken to be the -1st element of the matrix.
  • Weights will be preserved. If splitting by rows, then the top and bottom parts of the weights vector will be assigned to the top and bottom parts of the main data set. If splitting by columns, identical copies of the weights vector will be assigned to both parts.
  • Data is copied, so you may want to call apop_data_free(in) after this.
apop_data* apop_data_stack ( apop_data m1,
apop_data m2,
char  posn,
char  inplace 
)

Put the first data set either on top of or to the left of the second data set.

The fn returns a new data set, meaning that at the end of this function, until you apop_data_free() the original data sets, you will be taking up twice as much memory. Plan accordingly.

For the opposite operation, see apop_data_split.

Parameters
m1the upper/leftmost data set (default = NULL)
m2the second data set (default = NULL)
posnIf 'r', stack rows of m1's matrix above rows of m2's
if 'c', stack columns of m1's matrix to left of m2's
(default = 'r')
inplaceIf 'y', use apop_matrix_realloc and apop_vector_realloc to modify m1 in place; see the caveats on those functions. Otherwise, allocate a new data set, leaving m1 unmolested. (default='n')
Returns
The stacked data, either in a new apop_data set or m1
Exceptions
out->error=='a'Allocation error.
out->error=='d'Dimension error; couldn't make a complete copy.
  • If m1 or m2 are NULL, this returns a copy of the other element, and if both are NULL, you get NULL back (except if m2 is NULL and inplace is 'y', where you'll get the original m1 pointer back)
  • Text is handled as you'd expect: If 'r', one set of text is stacked on top of the other [number of columns must match]; if 'c', one set of text is set next to the other [number of rows must match].
  • more is ignored.
  • If stacking rows on rows, the output vector is the input vectors stacked accordingly. If stacking columns by columns, the output vector is just a copy of the vector of m1 and m2->vector doesn't appear in the output at all.
  • The same rules for dealing with the vector(s) hold for the vector(s) of weights.
  • Names are a copy of the names for m1, with the names for m2 appended to the row or column list, as appropriate.
  • This function uses the Designated initializers syntax for inputs.
apop_data* apop_data_to_bins ( apop_data indata,
apop_data binspec,
int  bin_count,
char  close_top_bin 
)

Create a histogram from data by putting data into bins of fixed width.

Parameters
indataThe input data that will be binned. This is copied and the copy will be modified.
close_top_binNormally, a bin covers the range from the point equal to its minimum to points strictly less than the minimum plus the width. if 'y', then the top bin includes points less than or equal to the upper bound. This solves the problem of displaying histograms where the top bin is just one point.
binspecThis is an apop_data set with the same number of columns as indata. If you want a fixed size for the bins, then the first row of the bin spec is the bin width for each column. This allows you to specify a width for each dimension, or specify the same size for all with something like:
bin_countIf you don't provide a bin spec, I'll provide this many evenly-sized bins. Default: $\sqrt{N}$.
1 Apop_row(indata, 0, firstrow);
2 apop_data *binspec = apop_data_copy(firstrow);
3 gsl_matrix_set_all(binspec->matrix, 10); //bins of size 10 for all dim.s
4 apop_data_to_bins(indata, binspec);
The presumption is that the first bin starts at zero in all cases. You can add a second row to the spec to give the offset for each dimension. Default: NULL. if no binspec and no binlist, then a grid with offset equal to the min of the column, and bin size such that it takes $\sqrt{N}$ bins to cover the range to the max element.
Returns
A pointer to a binned apop_data set. If you didn't give me a binspec, then I attach one to the output set as a page named <binspec>, so you can snap a second data set to the same grid using
1 apop_data_to_bins(first_set, NULL);
2 apop_data_to_bins(second_set, apop_data_get_page(first_set, "<binspec>"));

The text segment, if any, is not binned. I use apop_data_pmf_compress as the final step in the binning, and that does respect the text segment.

Here is a sample program highlighting the difference between apop_data_to_bins and apop_data_pmf_compress .

#define _GNU_SOURCE
#include <apop.h>
#ifdef Testing
#define printdata(dataset) ;
#else
#define printdata(dataset) \
printf("\n-----------\n\n"); \
apop_data_print(dataset);
#endif
int main(){
apop_data_fill(d, 1, 2, 3, 3, 1, 2);
apop_text_fill(d, "A", "A", "A", "A", "A", "B");
asprintf(&d->names->title, "Original data set");
printdata(d);
//binned, where bin ends are equidistant but not necessarily in the data
apop_data *binned = apop_data_to_bins(d, NULL);
asprintf(&binned->names->title, "Post binning");
printdata(binned);
assert(apop_sum(binned->weights)==6);
assert(fabs(//equal distance between bins
(apop_data_get(binned, 1, -1) - apop_data_get(binned, 0, -1))
- (apop_data_get(binned, 2, -1) - apop_data_get(binned, 1, -1))) < 1e-5);
//compressed, where the data is as in the original, but weights
//are redone to accommodate repeated observations.
asprintf(&d->names->title, "Post compression");
printdata(d);
assert(apop_sum(d->weights)==6);
apop_data *firstrow = Apop_r(d, 0); //1A
assert(fabs(apop_p(firstrow, d_as_pmf) - 2./6) < 1e-5);
}
apop_data* apop_data_to_dummies ( apop_data d,
int  col,
char  type,
int  keep_first,
char  append,
char  remove 
)

A utility to make a matrix of dummy variables. You give me a single vector that lists the category number for each item, and I'll produce a matrix with a single one in each row in the column specified.

After that, you have to decide what to do with the new matrix and the original data column.

  • You can manually join the dummy data set with your main data, e.g.:
    1 apop_data *dummies = apop_data_to_dummies(main_regression_vars, .col=8, .type='t');
    2 apop_data_stack(main_regression_vars, dummies, 'c', .inplace='y');
  • The .remove='y' option specifies that I should use apop_data_rm_columns to remove the column used to generate the dummies. Implemented only for type=='d'.
  • By specifying .append='y' or .append='e' I will run the above two lines for you. Your apop_data pointer will not change, but its matrix element will be reallocated (via apop_data_stack).
  • By specifying .append='i', I will place the matrix of dummies in place, immediately after the data column you had specified. You will probably use this with .remove='y' to replace the single column with the new set of dummy columns. Bear in mind that if there are two or more dummy columns (which there probably are if you are bothering to use this function), subsequent column numbers will change.
  • If .append='i' and you asked for a text column, I will append to the end of the table, which is equivalent to append='e'.
Parameters
dThe data set with the column to be dummified (No default.)
colThe column number to be transformed; -1==vector (default = 0)
type'd'==data column, 't'==text column. (default = 't')
keep_firstif zero, return a matrix where each row has a one in the (column specified MINUS ONE). That is, the zeroth category is dropped, the first category has an entry in column zero, et cetera. If you don't know why this is useful, then this is what you need. If you know what you're doing and need something special, set this to one and the first category won't be dropped. (default = 0)
appendIf 'e' or 'y', append the dummy grid to the end of the original data matrix. If 'i', insert in place, immediately after the original data column. (default = 'n')
removeIf 'y', remove the original data or text column. (default = 'n')
Returns
An apop_data set whose matrix element is the one-zero matrix of dummies. If you used .append, then this is the main matrix. Also, I add a page named "<categories for your_var>" giving a reference table of names and column numbers (where your_var is the appropriate column heading).
Exceptions
out->error=='a'allocation error
out->error=='d'dimension error
apop_data* apop_data_to_factors ( apop_data data,
char  intype,
int  incol,
int  outcol 
)

Convert a column of text or numbers into a column of numeric factors, which you can use for a multinomial probit/logit, for example.

If you don't run this on your data first, apop_probit and apop_logit default to running it on the vector or (if no vector) zeroth column of the matrix of the input apop_data set, because those models need a list of the unique values of the dependent variable.

Parameters
dataThe data set to be modified in place. (No default. If NULL, returns NULL and a warning)
intypeIf 't', then incol refers to text, otherwise ('d' is a good choice) refers to the vector or matrix. Default = 't'.
incolThe column in the text that will be converted. -1 is the vector. Default = 0.
outcolThe column in the data set where the numeric factors will be written (-1 means the vector). Default = 0.

For example:

1 apop_data *d = apop_query_to_mixed_data("mmt", "select 1, year, color from data");
2 apop_data_to_factors(d);

Notice that the query pulled a column of ones for the sake of saving room for the factors. It reads column zero of the text, and writes it to column zero of the matrix.

Another example:

1 apop_data *d = apop_query_to_data("mmt", "select type, year from data");
2 apop_data_to_factors(d, .intype='d', .incol=0, .outcol=0);

Here, the type column is converted to sequential integer factors and those factors overwrite the original data. Since a reference table is added as a second page of the apop_data set, you can recover the original values as needed.

Returns
A table of the factors used in the code. This is an apop_data set with only one column of text. Also, I add a page named "<categories for your_var>" giving a reference table of names and column numbers (where your_var is the appropriate column heading); use apop_data_get_factor_names to retrieve that table.
Exceptions
out->error=='a'allocation error.
out->error=='d'dimension error.
  • If the vector or matrix you wanted to write to is NULL, I will allocate it for you.
  • This function uses the Designated initializers syntax for inputs.
apop_data* apop_data_transpose ( apop_data in,
char  transpose_text,
char  inplace 
)

Transpose the matrix and text elements of the input data set, including the row/column names.

The vector and weights elements of the input data set are completely ignored (but see also apop_vector_to_matrix, which can convert a vector to a 1 X N matrix.) If copying, these other elements won't be present; if .inplace='y', it is up to you to handle these not-transposed elements correctly.

Parameters
inThe input apop_data set. If NULL, I return NULL. Default is NULL.
transpose_textIf 'y', then also transpose the text element. Default is 'y'.
inplaceIf 'y', transpose the input in place; if 'n', produce a transposed copy, leaving the original untouched. Due to how gsl_matrix_transpose_memcpy works, a copy will still be made, then copied to the original location. Default is 'y'.
Returns
If inplace=='n', a newly alloced apop_data set, with the appropriately transposed matrix and/or text. The vector and weights elements will be NULL. If transpose_text='n', then the text element of the output set will also be NULL.
if inplace=='y', a pointer to the original data set, with matrix and (if transpose_text='y') text transposed and vector and weights left in place untouched.
  • Row names are written to column names of the output matrix, text, or both (whichever is not empty in the input).
  • If only the matrix or only the text have names, then the one set of names is written to the row names of the output.
  • If both matrix column names and text column names are present, text column names are lost.
  • if you have a gsl_matrix with no names or text, you may prefer to use gsl_matrix_transpose_memcpy.
int apop_db_close ( char  vacuum)

Closes the database on disk. If you opened the database with apop_db_open(NULL), then this is basically optional.

Parameters
vacuum'v': vacuum—do clean-up to minimize the size of the database on disk.
'q': Don't bother; just close the database. (default = 'q')
Returns
0 on OK, nonzero on error.
int apop_db_open ( char const *  filename)

If you want to use a database on the hard drive instead of memory, then call this once and only once before using any other database utilities.

If you want a disposable database which you won't use after the program ends, don't bother with this function.

The trade-offs between an on-disk database and an in-memory db are as one would expect: memory is faster, but is destroyed when the program exits. SQLite includes a command line utility (sqlite3) which lets you run queries against a database on disk, which may be useful for debugging. There are also some graphical front-ends; just ask your favorite search engine for SQLite GUI.

MySQL users: either set the environment variable APOP_DB_ENGINE=mysql or set apop_opts.db_engine = 'm'.

The Apophenia package assumes you are only using a single SQLite database at a time. You can use the SQL attach function to load other databases, or see this blog post for further suggestions and sample code.

When you are done doing your database manipulations, be sure to call apop_db_close if writing to disk.

Parameters
filenameThe name of a file on the hard drive on which to store the database. If NULL, then the database will be kept in memory (in which case, the other database functions will call this function for you and you don't need to bother).
Returns
0: everything OK
1: database did not open.
apop_data* apop_db_to_crosstab ( char *  tabname,
char *  r1,
char *  r2,
char *  datacol 
)

Give the name of a table in the database, and names of three of its columns: the x-dimension, the y-dimension, and the data. The output is a 2D matrix with rows indexed by r1 and cols by r2.

Parameters
tabnameThe database table I'm querying. Anything that will work inside a from clause is OK, such as a subquery in parens.
r1The column of the data set that will indicate the rows of the output crosstab
r2The column of the data set that will indicate the columns of the output crosstab
datacolThe column of the data set holding the data for the cells of the crosstab
  • If the query to get data to fill the table (select r1, r2, datacol from tabname) returns an empty data set, then I will return a NULL data set and if apop_opts.verbosity >= 1 print a warning.
  • This setup presumes that there is one value for each (row, col) coordinate in the data. You may want an aggregate instead. There are two ways to do this, both of which hack the fact that this function runs a simple select query to generate the data. One is to specify an ad hoc table to pull from:
1 apop_data * out = apop_db_to_crosstab("(select row, col, count(*) ct from base_data group by row, col)", "row", "col", "ct");

The other is to use the fact that the table name will be at the end of the query, so you can add conditions to the table:

1 apop_data * out = apop_db_to_crosstab("base_data group by row, col", "row", "col", "count(*)");
2 //which will expand to "select row, col, count(*) from base_data group by row, col"
See also
apop_crosstab_to_db
Exceptions
out->error='n'Name not found error.
out->error='q'Query returned an empty table (which might mean that it just failed).
void apop_estimate_parameter_tests ( apop_model est)

For many, it is a knee-jerk reaction to a parameter estimation to test whether each individual parameter differs from zero. This function does that.

Parameters
estThe apop_model, which includes pre-calculated parameter estimates, var-covar matrix, and the original data set.

Returns nothing. At the end of the routine, est->info->more includes a set of t-test values: p value, confidence (=1-pval), t statistic, standard deviation, one-tailed Pval, one-tailed confidence.

apop_data* apop_f_test ( apop_model est,
apop_data contrast 
)

Runs an F-test specified by q and c. Your best bet is to see the chapter on hypothesis testing in Modeling With Data, p 309. It will tell you that:

\[{N-K\over q} {({\bf Q}'\hat\beta - {\bf c})' [{\bf Q}' ({\bf X}'{\bf X})^{-1} {\bf Q}]^{-1} ({\bf Q}' \hat\beta - {\bf c}) \over {\bf u}' {\bf u} } \sim F_{q,N-K},\]

and that's what this function is based on.

Parameters
estan apop_model that you have already calculated. (No default)
contrastThe matrix ${\bf Q}$ and the vector ${\bf c}$, where each row represents a hypothesis. (Defaults: if matrix is NULL, it is set to the identity matrix with the top row missing. If the vector is NULL, it is set to a zero vector of length equal to the height of the contrast matrix. Thus, if the entire apop_data set is NULL or omitted, we are testing the hypothesis that all but $\beta_1$ are zero.)
Returns
An apop_data set with a few variants on the confidence with which we can reject the joint hypothesis.
Todo:
There should be a way to get OLS and GLS to store $(X'X)^{-1}$. In fact, if you did GLS, this is invalid, because you need $(X'\Sigma X)^{-1}$, and I didn't ask for $\Sigma$.
  • There are two approaches to an $F$-test: the ANOVA approach, which is typically built around the claim that all effects but the mean are zero; and the more general regression form, which allows for any set of linear claims about the data. If you send a NULL contrast set, I will generate the set of linear contrasts that are equivalent to the ANOVA-type approach. Readers of Modeling with Data, note that there's a bug in the book that claims that the traditional ANOVA approach also checks that the coefficient for the constant term is zero; this is not the custom and doesn't produce the equivalence presented in that and other textbooks.
Exceptions
out->error='a'Allocation error.
out->error='d'dimension-matching error.
out->error='i'matrix inversion error.
out->error='m'GSL math error.
double apop_generalized_harmonic ( int  N,
double  s 
)

Calculate $\sum_{n=1}^N {1\over n^s}$

  • There are no doubt efficient shortcuts to doing this, but I use brute force. [Though Knuth's The Art of Computer Programming v1 doesn't offer anything, which is a strong indication of nonexistence.] To speed things along, I save the results so that they can just be looked up should you request the same calculation.
  • If N is zero or negative, return NaN. Notify the user if apop_opts.verbosity >=1

For example:

#include <apop.h>
int main(){
double out = apop_generalized_harmonic(270, 0.0);
assert (out == 270);
out = apop_generalized_harmonic(370, -1.0);
assert (out == 370*371/2);
out = apop_generalized_harmonic(12, -1.0);
assert (out == 12*13/2);
}
apop_data* apop_jackknife_cov ( apop_data in,
apop_model model 
)

Give me a data set and a model, and I'll give you the jackknifed covariance matrix of the model parameters.

The basic algorithm for the jackknife (with many details glossed over): create a sequence of data sets, each with exactly one observation removed, and then produce a new set of parameter estimates using that slightly shortened data set. Then, find the covariance matrix of the derived parameters.

Jackknife or bootstrap? As a broad rule of thumb, the jackknife works best on models that are closer to linear. The worse a linear approximation does (at the given data), the worse the jackknife approximates the variance.

Sample usage:

1 apop_data_show(apop_jackknife_cov(your_data, your_model));
Parameters
inThe data set. An apop_data set where each row is a single data point.
modelAn apop_model, that will be used internally by apop_estimate.
Exceptions
out->error=='n'NULL input data.
Returns
An apop_data set whose matrix element is the estimated covariance matrix of the parameters.
See also
apop_bootstrap_cov
double apop_kl_divergence ( apop_model from,
apop_model to,
int  draw_ct,
gsl_rng *  rng 
)

Kullback-Leibler divergence.

This measure of the divergence of one distribution from another has the form $ D(p,q) = \sum_i \ln(p_i/q_i) p_i $. Notice that it is not a distance, because there is an asymmetry between $p$ and $q$, so one can expect that $D(p, q) \neq D(q, p)$.

Parameters
fromthe $p$ in the above formula. (No default; must not be NULL)
tothe $q$ in the above formula. (No default; must not be NULL)
draw_ctIf I do the calculation via random draws, how many? (Default = 1e5)
rngA gsl_rng. If NULL or number of threads is greater than 1, I'll take care of the RNG; see apop_rng_get_thread. (Default = NULL)

This function can take empirical histogram-type models (apop_pmf) or continuous models like apop_loess or apop_normal.

If there is a PMF (I'll try from first, under the presumption that you are measuring the divergence of data from an observed data distribution), then I'll step through it for the points in the summation.

  • If you have two empirical distributions, then they must be synced: if $p_i>0$ but $q_i=0$, then the function returns GSL_NEGINF. If apop_opts.verbose >=1 I print a message as well.

If neither distribution is a PMF, then I'll take draw_ct random draws from to and evaluate at those points.

  • Set apop_opts.verbose = 3 for observation-by-observation info.
long double apop_linear_constraint ( gsl_vector *  beta,
apop_data constraint,
double  margin 
)

This is designed to be called from within the constraint method of your apop_model. Just write the constraint vector+matrix and this will do the rest. See the outline page for detailed discussion on setting contrasts.

Parameters
betaThe proposed vector about to be tested. No default, must not be NULL.
constraintA vector/matrix pair [v | m1 m2 ... mn] where each row is interpreted as a less-than inequality: $v < m1x1+ m2x2 + ... + mnxn$. For example, say your constraints are $3 < 2x + 4y - 7z$ and $y$ is positive, i.e. $0 < y$. Allocate and fill the matrix representing these two constraints via:
1 apop_data *constr = apop_data_falloc((2,2,3), 3, 2, 4, 7,
2  0, 0, 1, 0);
. Default: each element is greater than zero. E.g., for three parameters:
1 apop_data *constr = apop_data_falloc((3,3,3), 0, 1, 0, 0,
2  0, 0, 1, 0,
3  0, 0, 0, 1);
marginIf zero, then this is a >= constraint, otherwise I will return a point this amount within the borders. You could try GSL_DBL_EPSILON, which is the smallest value a double can hold, or something like 1e-3. Default = 0.

Returns
The penalty = the distance between beta and the closest point that meets the constraints. If the constraint is not met, this beta is shifted by margin (Euclidean distance) to meet the constraints.

  • This function uses the Designated initializers syntax for inputs. Todo: the apop_linear_constraint function doesn't check for odd cases like coplanar constraints.
int apop_matrix_is_positive_semidefinite ( gsl_matrix *  m,
char  semi 
)

Test whether the input matrix is positive semidefinite.

A covariance matrix will always be PSD, so this function can tell you whether your matrix is a valid covariance matrix.

Consider the 1x1 matrix in the upper left of the input, then the 2x2 matrix in the upper left, on up to the full matrix. If the matrix is PSD, then each of these has a positive determinant. This function thus calculates $N$ determinants for an $N$x $N$ matrix.

Parameters
mThe matrix to test. If NULL, I will return zero—not PSD.
semiIf anything but 's', check for positive definite, not semidefinite. (default 's')

See also apop_matrix_to_positive_semidefinite, which will change the input to something PSD.

void apop_matrix_normalize ( gsl_matrix *  data,
const char  row_or_col,
const char  normalization 
)

Normalize each row or column in the given matrix, one by one.

Basically just a convenience fn to iterate through the columns or rows and run apop_vector_normalize for you.

Parameters
dataThe data set to normalize.
row_or_colEither 'r' or 'c'.
normalizationsee apop_vector_normalize.
gsl_matrix* apop_matrix_realloc ( gsl_matrix *  m,
size_t  newheight,
size_t  newwidth 
)

This function will resize a gsl_matrix to a new height or width.

Data in the matrix will be retained. If the new height or width is smaller than the old, then data in the later rows/columns will be cropped away (in a non–memory-leaking manner). If the new height or width is larger than the old, then new cells will be filled with garbage; it is your responsibility to zero out or otherwise fill new rows/columns before use.

Warning I: Using this function is basically bad form—especially when used in a for loop that adds a column each time. A large number of reallocs can take a noticeable amount of time. You are thus encouraged to make an effort to determine the size of your data beforehand.

Warning II: The gsl_matrix is a versatile struct that can represent submatrices and other cuts from parent data. I can't deal with those, and check for such situations beforehand. [Besides, resizing a portion of a parent matrix makes no sense.]

Parameters
mThe already-allocated matrix to resize. If you give me NULL, this becomes equivalent to gsl_matrix_alloc
newheight,newwidthThe height and width you'd like the matrix to be.
Returns
m, now resized
gsl_matrix* apop_matrix_rm_columns ( gsl_matrix *  in,
int *  drop 
)

Delete columns from a matrix.

This is done via copying, so if you have an exceptionally large data set, you're better off producing the matrix in the perfect form directly.

Parameters
inthe gsl_matrix to be subsetted
Returns
a gsl_matrix with the specified columns removed. If you ask me to remove no columns, I'll return a copy of the original. If you ask me to remove all columns, I'll return NULL.
Parameters
dropan array of ints. If drop[7]==1, then column seven will be cut from the output.
apop_data* apop_matrix_to_data ( gsl_matrix *  m)

Deprecated; please do not use. Just use a compound literal:

1 //Given:
2 gsl_vector *v;
3 gsl_matrix *m;
4 
5 // Then this form wraps the elements into \ref apop_data structs. Note that
6 // these are not pointers: they're automatically allocated and therefore
7 // the extra memory use for the wrapper is cleaned up on exit from scope.
8 
9 apop_data *dv = &(apop_data){.vector=v};
10 apop_data *dm = &(apop_data){.matrix=m};
11 
12 apop_data *v_dot_m = apop_dot(dv, dm);
13 
14 //Here is a macro to hide C's ugliness:
15 #define As_data(...) (&(apop_data){__VA_ARGS__})
16 
17 apop_data *v_dot_m2 = apop_dot(As_data(.vector=v), As_data(.matrix=m));
18 
19 //The wrapped object is an automatically-allocated structure pointing to the
20 //original data. If it needs to persist or be separate from the original,
21 //make a copy:
22 apop_data *dm_copy = apop_data_copy(As_data(.vector=v, .matrix=m));
double apop_matrix_to_positive_semidefinite ( gsl_matrix *  m)

First, this function passes tests, but is under development.

It takes in a matrix and converts it to the `closest' positive semidefinite matrix.

Parameters
mOn input, any matrix; on output, a positive semidefinite matrix.
Returns
the distance between the original and new matrices.

Adapted from the R Matrix package's nearPD, which is Copyright (2007) Jens Oehlschlägel [and is GPL].

apop_model* apop_ml_impute ( apop_data d,
apop_model mvn 
)

Impute the most likely data points to replace NaNs in the data, and insert them into the given data. That is, the data set is modified in place.

How it works: this uses the machinery for apop_model_fix_params. The only difference is that this searches over the data space and takes the parameter space as fixed, while basic fix params model searches parameters and takes data as fixed. So this function just does the necessary data-parameter switching to make that happen.

Parameters
dThe data set. It comes in with NaNs and leaves entirely filled in.
mvnA parametrized apop_model from which you expect the data was derived. If NULL, then I'll use the Multivariate Normal that best fits the data after listwise deletion.
Returns
An estimated apop_ml_impute_model. Also, the data input will be filled in and ready to use.
apop_data* apop_model_draws ( apop_model model,
int  count,
apop_data draws 
)

Make a set of random draws from a model and write them to an apop_data set.

Parameters
modelThe model from which draws will be made. Must already be prepared and/or estimated.
countThe number of draws to make. If draws is not NULL, then this is ignored and count=draws->matrix->size1. default=1000.
drawsIf not NULL, a pre-allocated data set whose matrix element will be filled with draws.
Returns
An apop_data set with the matrix filled with count draws. If draws!=NULL, then return a pointer to it.
Exceptions
out->error=='m'Input model isn't good for making draws: it is NULL, or m->dsize=0.
out->error=='s'You gave me a draws matrix, but its size is less than the size of a single draw from the data, model->dsize.
out->error=='d'Trouble drawing from the distribution for at least one row. That row is set to all NAN.
  • Prints a warning if you send in a non-NULL apop_data set, but its matrix element is NULL, when apop_opts.verbose>=1.
  • See also apop_draw, which makes a single draw.

Here is a two-line program to draw a different set of ten Standard Normals on every run (provided runs are more than a second apart):

#include <apop.h>
#include <time.h>
int main(){
apop_opts.rng_seed = time(NULL);
apop_data_print(
apop_model_draws(
apop_model_set_parameters(apop_normal, 0, 1),
.count=10,
)
);
}
apop_model* apop_model_fix_params ( apop_model model_in)

Produce a model based on another model, but with some of the parameters fixed at a given value.

You will send me the model whose parameters you want fixed, with the parameters element set as follows. For the fixed parameters, simply give the values to which they will be fixed. Set the free parameters to NaN.

For example, here is a Binomial distribution with a fixed $n=30$ but $p_1$ allowed to float freely:

1 apop_model *bi30 = apop_model_fix_params(apop_model_set_parameters(apop_binomial, 30, GSL_NAN));
2 Apop_model_add_group(bi30, apop_mle, .starting_pt=(double[]){.5}); // The Binomial doesn't like the default
3  // starting point of 1.
4 apop_model *out = apop_estimate(your_data, bi30);

The output is an apop_model that can be estimated, Bayesian updated, et cetera.

  • Rather than using this model, you may simply want a now-filled-in copy of the original model. Use apop_model_fix_params_get_base to retrieve the original model's parameters.
  • The estimate method always uses an MLE, and it never calls the base model's estimate method.
  • If the input model has MLE-style settings attached, I'll use them for the estimate method. Otherwise, I'll set my own.
  • If the parameter input has non-NaN values at the free parameters, then I'll use those as the starting point for any MLE search; the defaults for the variables without fixed values starts from 1 as usual.
  • I do check the more pointer of the parameters for additional pages and NaNs on those pages.

Here is a sample program. It produces a few thousand draws from a Multivariate Normal distribution, and then tries to recover the means given a var/covar matrix fixed at the correct variance.

#include <apop.h>
int main(){
size_t ct = 5e4;
//set up the model & params
apop_data *params = apop_data_alloc(2,2,2);
apop_data_fill(params, 8, 1, 0.5,
2, 0.5, 1);
apop_model *pvm = apop_model_copy(apop_multivariate_normal);
pvm->parameters = apop_data_copy(params);
pvm->dsize = 2;
apop_data *d = apop_model_draws(pvm, ct);
//set up and estimate a model with fixed covariance matrix but free means
gsl_vector_set_all(pvm->parameters->vector, GSL_NAN);
apop_model *mep1 = apop_model_fix_params(pvm);
apop_model *e1 = apop_estimate(d, mep1);
//compare results, via assert for the test suite, or on-screen for human use.
#ifdef Testing
assert(apop_vector_distance(params->vector, e1->parameters->vector)<1e-2);
#else
printf("original params: ");
apop_vector_show(params->vector);
printf("estimated params: ");
apop_vector_show(e1->parameters->vector);
#endif
}
Parameters
model_inThe base model
Returns
a model that can be used like any other, with the given params fixed or free.
apop_model* apop_model_fix_params_get_base ( apop_model fixed_model)

The apop_model_fix_params function produces a model that has only the non-fixed parameters of the model. After estimation of the fixed-parameter model, this function fills the parameters element of the base model and returns a pointer to the base model.

apop_data* apop_model_hessian ( apop_data data,
apop_model model,
double  delta 
)

Numerically estimate the matrix of second derivatives of the parameter values. The math is simply a series of re-evaluations at small differential steps. [Therefore, it may be expensive to do this for a very computationally-intensive model.]

Parameters
dataThe data at which the model was estimated
modelThe model, with parameters already estimated
deltathe step size for the differentials. The current default is around 1e-3.
Returns
The matrix of estimated second derivatives at the given data and parameter values.
apop_model* apop_model_metropolis ( apop_data d,
gsl_rng *  rng,
apop_model m 
)

Use Metropolis-Hastings Markov chain Monte Carlo to make draws from the given model.

The basic storyline is that draws are made from a proposal distribution, and the likelihood of your model given your data and the drawn parameters is evaluated. At each step, a new set of proposal parameters is drawn. If they are more likely than the previous set, the new proposal is accepted as the next step; otherwise, with probability (prob of new params)/(prob of old params), they are accepted as the next step anyway. If neither happens, the last accepted proposal is repeated.

The output is an apop_pmf model with a data set listing the draws that were accepted, including those repetitions. The output model is modified so that subsequent draws are one more step from the Markov chain, via apop_model_metropolis_draw.

  • If a proposal fails to meet the constraint element of the model you input, then the proposal is thrown out and a new one selected. By the default proposal distribution, this is not mathematically correct (it breaks detailed balance), and values near the constraint will be oversampled. The output model will have outmodel->error=='c'. It is up to you to decide whether the resulting distribution is good enough for your purposes or whether to take the time to write a custom proposal and step function to accommodate the constraint.

Attach an apop_mcmc_settings group to your model to specify the proposal distribution, burnin, and other details of the search. See the apop_mcmc_settings documentation for details.

  • The default proposal includes an adaptive step: you specify a target accept rate (default: .35), and if the accept rate is currently higher the variance of the proposals is widened to explore more of the space; if the accept rate is currently lower the variance is narrowed to stay closer to the last accepted proposal. Technically, this breaks ergodicity of the Markov chain, but the consensus seems to be that this is not a serious problem. If it does concern you, you can set the base_adapt_fn in the apop_mcmc_settings group to a do-nothing function, or one that damps its adaptation as $n\to\infty$.
  • Note the gibbs_chunks element of the apop_mcmc_settings group. If you set gibbs_chunks='a', all parameters are drawn as a set, and accepted/rejected as a set. The variances are adapted at an identical rate. If you set gibbs_chunks='i', then each scalar parameter is assigned its own proposal distribution, which is adapted at its own pace. With gibbs_chunks='b' (the default), then each of the vector, matrix, and weights of your model's parameters are drawn/accepted/adapted as a group (and so on to additional chunks if your model has ->more pages). This works well for complex models which naturally break down into subsets of parameters.

Each chunk counts as a step in the Markov chain. Therefore, if there are several chunks, you can expect chunks to repeat from step to step. If you want a draw after cycling through all chunks, try using apop_model_metropolis_draw, which has that behavior.

Parameters
dThe apop_data set used for evaluating the likelihood of a proposed parameter set.
rngA gsl_rng, probably allocated via apop_rng_alloc. (Default: an RNG from apop_rng_get_thread)
mThe apop_model from which parameters are being drawn. (No default; must not be NULL)
  • If the likelihood model has no parameters, I will allocate them. That means you can use one of the stock models that ship with Apophenia. If I need to run the model's prep routine to get the size of the parameters, then I'll make a copy of the likelihood model, run prep, and then allocate parameters for that copy of a model.
  • On exit, the parameters element of your likelihood model has the last accepted parameter proposal.
  • If you set apop_opts.verbose=2 or greater, I will report the accept rate of the M-H sampler. It is a common rule of thumb to select a proposal so that this is between 20% and 50%. Set apop_opts.verbose=3 to see the stream of proposal points, their likelihoods, and the acceptance odds. You may want to set apop_opts.log_file=fopen("yourlog", "w") first.
Returns
A modified apop_pmf model representing the results of the search. It has a specialized draw method that returns another step from the Markov chain with each draw.
Exceptions
out->error='c'Proposal was outside of a constraint; see above.
apop_data* apop_model_numerical_covariance ( apop_data data,
apop_model model,
double  delta 
)

Produce the covariance matrix for the parameters of an estimated model via the derivative of the score function at the parameter. I.e., I find the second derivative via apop_model_hessian , and take the negation of the inverse.

I follow Efron and Hinkley in using the estimated information matrix—the value of the information matrix at the estimated value of the score—not the expected information matrix that is the integral over all possible data. See Pawitan 2001 (who cribbed a little off of Efron and Hinkley) or Klemens 2008 (who directly cribbed off of both) for further details.

Parameters
dataThe data by which your model was estimated
modelA model whose parameters have been estimated.
deltaThe differential by which to step for sampling changes. (default currently = 1e-3)
Returns
A covariance matrix for the data. Also, if the data does not have a "Covariance" page, I'll set it to the result as well [i.e., I won't overwrite an existing covar].

This function uses the Designated initializers syntax for inputs.

long double apop_multivariate_gamma ( double  a,
int  p 
)

The multivariate generalization of the Gamma distribution.

\[ \Gamma_p(a)= \pi^{p(p-1)/4}\prod_{j=1}^p \Gamma\left[ a+(1-j)/2\right]. \]

Because $\Gamma(x)$ is undefined for $x\in\{0, -1, -2, ...\}$, this function returns NAN when $a+(1-j)/2$ takes on one of those values.

See also apop_multivariate_lngamma, which is more numerically stable in most cases.

long double apop_multivariate_lngamma ( double  a,
int  p 
)

The log of the multivariate generalization of the Gamma; see also apop_multivariate_gamma.

int apop_name_add ( apop_name n,
char const *  add_me,
char  type 
)

Adds a name to the apop_name structure. Puts it at the end of the given list.

Parameters
nAn existing, allocated apop_name structure.
add_meA string. If NULL, do nothing; return -1.
type'r': add a row name
'c': add a column name
't': add a text category name
'h': add a title (or a header. 't' is taken).
'v': add (or overwrite) the vector name
Returns
Returns the number of rows/cols/depvars after you have added the new one. But if add_me is NULL, return -1.
apop_name* apop_name_alloc ( void  )

Allocates a name structure

Returns
An allocated, empty name structure. In the very unlikely event that malloc fails, return NULL.
apop_name* apop_name_copy ( apop_name in)

Copy one apop_name structure to another. That is, all data is duplicated. Usage:

1 apop_name *out = apop_name_copy(in);
Parameters
inThe input names
Returns
A structure that this function will allocate and fill
int apop_name_find ( const apop_name n,
const char *  name,
const char  type 
)

Finds the position of an element in a list of names.

The function uses case-insensitive search (POSIX's strcasecmp).

Parameters
nthe apop_name object to search.
namethe name you seek; see above.
type'c', 'r', or 't'. Default is 'c'.
Returns
The position of name. If 'c', then this may be -1, meaning the vector name. If not found, returns -2. On error, e.g. name==NULL, returns -2.
void apop_name_free ( apop_name free_me)

Erases an apop_name structure.

void apop_name_print ( apop_name n)

Prints the given list of names to STDOUT. Useful for debugging, and not much else.

Parameters
nThe apop_name structure
void apop_name_stack ( apop_name n1,
apop_name nadd,
char  type1,
char  typeadd 
)

Append one list of names to another.

Notice that if the first list is empty, then this is a copy function. If the second is NULL, it is a no-op.

Parameters
n1The first set of names (no default, must not be NULL)
naddThe second set of names, which will be appended after the first. (no default, if NULL, a no-op)
type1Either 'c', 'r', 't', or 'v' stating whether you are merging the columns, rows, or text. If 'v', then ignore typeadd and just overwrite the target vector name with the source name. (default = 'r')
typeaddEither 'c', 'r', 't', or 'v' stating whether you are merging the columns, rows, or text. If 'v', then overwrite the target with the source vector name. (default = type1)
apop_data* apop_rake ( char const *  margin_table,
char *const *  var_list,
int  var_ct,
char const *  all_vars,
char *const *  contrasts,
int  contrast_ct,
char const *  structural_zeros,
int  max_iterations,
double  tolerance,
char const *  count_col,
int  run_number,
char const *  init_table,
char const *  init_count_col,
double  nudge,
char const *  table_name 
)

Fit a log-linear model via iterative proportional fitting, aka raking.

Raking has many uses. The Modeling with Data blog presents a series of discussions of uses of raking, including some worked examples.

Or see Wikipedia for an overview of Log linear models, aka Poisson regressions. One approach toward log-linear modeling is a regression form; let there be four categories, A, B, C, and D, from which we can produce a model positing, for example, that cell count is a function of a form like $g_1(A) + g_2(BC) + g_3(CD)$. In this case, we would assign a separate coefficient to every possible value of A, every possible value of (B, C), and every value of (C, D). Raking is the technique that searches for that large set of parameters.

The combinations of categories that are considered to be relevant are called contrasts, after ANOVA terminology of the 1940s.

The other constraint on the search are structural zeros, which are values that you know can never be non-zero, due to field-specific facts about the variables. For example, U.S. Social Security payments are available only to those age 65 or older, so "age <65 and gets_soc_security=1" is a structural zero.

Because there is one parameter for every combination, there may be millions of parameters to estimate, so the search to find the most likely value requires some attention to technique. For over half a century, the consensus method for searching has been raking, which iteratively draws each category closer to the mean in a somewhat simple manner (this was first developed circa 1940 and had to be feasible by hand), but which is guaranteed to eventually arrive at the maximum likelihood estimate for all cells.

Another complication is that the table is invariably sparse. One can easily construct tables with millions of cells, but the corresponding data set may have only a few thousand observations.

This function uses the database to resolve the sparseness problem. It constructs a query requesting all combinations of categories that could possibly be non-zero after raking, given all of the above constraints. Then, raking is done using only that subset. This means that the work is done on a number of cells proportional to the number of data points, not to the full cross of all categories. Set apop_opts.verbose to 2 or greater to show the query on stderr.

  • One could use raking to generate `fully synthetic' data: start with observation-level data in a margin table. Begin the raking with a starting data set of all-ones. Then rake until the all-ones set transforms into something that conforms to the margins and (if any) structural zeros. You now have a data set which matches the marginal totals but does not use any other information from the observation-level data. If you do not specify an .init_table, then an all-ones default table will be used.
  • Set apop_opts.verbose=3 to see the intermediate tables at the end of each round of raking.
  • If you want all cells to have nonzero value, then you can do that via pre-processing:
    1 apop_query("update data_table set count_col = 1e-3 where count_col = 0");
Parameters
margin_tableThe name of the table in the database to use for calculating the margins. The table should have one observation per row. No default. (This used to be called table_name; that name is now deprecated.)
var_listThe full list of variables to search. A list of strings, e.g., (char *[]){"var1", "var2", ..., "var15"}
var_ctThe count of the full list of variables to search.
all_varsdeprecated.
contrastsThe contrasts describing your model. Like the all_vars input, each contrast is a pipe-delimited list of variable names. No default.
contrast_ctThe number of contrasts in the list of contrasts. No default.
structural_zerosa SQL clause indicating combinations that can never take a nonzero value. This will go into a where clause, so anything you could put there is OK, e.g. "age <65 and gets_soc_security=1 or age <15 and married=1". Your margin data is not checked for structural zeros. Default: no structural zeros.
max_iterationsNumber of rounds of raking at which the algorithm halts. Default: 1000.
toleranceI calculate the change for each cell from round to round; if the largest cell change is smaller than this, I stop. Default: 1e-5.
count_colThis column gives the count of how many observations are represented by each row. If NULL, each row represents one person. Default: NULL.
run_numberBecause I write intermediate tables to the database, I need a way to distinguish distinct runs should you be threading several runs at once. If you aren't running several instances simultaneously, don't worry about this; if you are, do supply a value, since it's hard for the function to supply one in a race-proof manner. Default: internally-maintained values.
init_tableThe default is to initially set all table elements to one and then rake from there. This is effectively the `fully synthetic' approach, which uses only the information in the margins and derives the data set closest to the all-ones data set that is consistent with the margins. Care is taken to maintain sparsity in this case. If you specify an init_table, then I will get the initial cell counts from it. Default: the fully-synthetic approach, using a starting point of an all-ones grid.
init_count_colThe column in init_table with the cell counts.
nudgeThere is a common hack of adding a small value to every zero entry, because a zero entry will always scale to zero, while a small value could eventually scale to anything. Recall that this function works on sparse sets, so I first filter out those cells that could possibly have a nonzero value given the observations, then I add nudge to any zero cells within that subset.
table_nameDeprecated; replaced with margin_table.
Returns
An apop_data set where every row is a single combination of variable values and the weights vector gives the most likely value for each cell.
Exceptions
out->error='i'Input was somehow wrong.
out->error='c'Raking did not converge, reached max. iteration count.
  • The interface is still beta, and subject to change—notably, handling of text categories will soon be added.
int apop_regex ( const char *  string,
const char *  regex,
apop_data **  substrings,
const char  use_case 
)

A convenience function for regular expression searching

  • There are three common flavors of regular expression: Basic, Extended, and Perl-compatible (BRE, ERE, PCRE). I use EREs, as per the specs of your C library, which should match POSIX's ERE specification.

For example, "p.val" will match "P value", "p.value", "p values" (and even "tempeval", so be careful).

If you give a non-NULL address in which to place a table of paren-delimited substrings, I'll return them as a row in the text element of the returned apop_data set. I'll return all the matches, filling the first row with substrings from the first application of your regex, then filling the next row with another set of matches (if any), and so on to the end of the string. Useful when parsing a list of items, for example.

Parameters
stringThe string to search (no default)
regexThe regular expression (no default)
substringsParens in the regex indicate that I should return matching substrings. Give me the address of an apop_data* set, and I will allocate and fill the text portion with matches. Default= NULL, meaning do not return substrings (even if parens exist in the regex). If no match, return an empty apop_data set, so output->textsize[0]==0.
use_caseShould I be case sensitive, 'y' or 'n'? (default = 'n', which is not the POSIX default.)
Returns
Count of matches found. 0 == no match. substrings may be allocated and filled if needed.
  • If apop_opts.stop_on_warning='n' returns -1 on error (e.g., regex NULL or didn't compile).
  • If string==NULL, I return 0—no match—and if substrings is provided, set it to NULL.
  • Here is the test function. Notice that the substring-pulling function call passes &subs, not plain subs. Also, the non-match has a zero-length blank in subs->text[0][1].
#include <apop.h>
int main(){
char string1[] = "Hello. I am a string.";
assert(apop_regex(string1, "hell"));
apop_data *subs;
apop_regex(string1, "(e).*I.*(xxx)*(am)", .substrings = &subs);
//apop_data_show(subs);
assert(!strcmp(subs->text[0][0], "e"));
assert(!strlen(subs->text[0][1]));
assert(!strcmp(subs->text[0][2], "am"));
//Split a comma-delimited list, throwing out white space.
//Notice that the regex includes only one instance of a non-comma blob
//ending in a non-space followed by a comma, but the function keeps
//applying it until the end of string.
char string2[] = " one, two , three ,four";
apop_regex(string2, " *([^,]*[^ ]) *(,|$) *", &subs);
assert(!strcmp(*subs->text[0], "one"));
assert(!strcmp(*subs->text[1], "two"));
assert(!strcmp(*subs->text[2], "three"));
assert(!strcmp(*subs->text[3], "four"));
//Get a parenthetical. For EREs, \( \) match plain parens in the text.
char string3[] = " one (but secretly, two)";
apop_regex(string3, "(\\([^)]*\\))", &subs);
assert(!strcmp(*subs->text[0], "(but secretly, two)"));
//NULL input string ==> no-op.
int match_count = apop_regex(NULL, " *([^,]*[^ ]) *(,|$) *", &subs);
assert(!match_count);
assert(!subs);
}
  • Each set of matches will be one row of the output data. E.g., given the regex ([A-Za-z])([0-9]), the column zero of outdata will hold letters, and column one will hold numbers. Use apop_data_transpose to reverse this so that the letters are in outdata->text[0] and numbers in outdata->text[1].
double apop_rng_GHgB3 ( gsl_rng *  r,
double *  a 
)

RNG from a Generalized Hypergeometric type B3.

Devroye uses this as the base for many of his distribution-generators, including the Waring.

  • If one of the inputs is <=0, error. Returns GSL_NAN if the function doesn't stop.
void apop_settings_copy_group ( apop_model outm,
apop_model inm,
char *  copyme 
)

Copy a settings group with the given name from the second model to the first. (i.e., the arguments are in memcpy order).

You probably won't need this often—just use apop_model_copy.

Parameters
outmThe model that will receive a copy of the settings group.
inmThe model that will provide the original.
copymeThe string naming the group. For example, for an apop_mcmc_settings group, this would be "apop_mcmc".
Exceptions
outm->error=='s'Error copying settings group.
int apop_table_exists ( char const *  name,
char  remove 
)

Check for the existence of a table, and maybe delete it.

Recreating a table which already exists can cause errors, so it is good practice to check for existence first. Also, this is the stylish way to delete a table, since just calling "drop table" will give you an error if the table doesn't exist.

Parameters
namethe table name (no default)
remove'd' ==>delete table so it can be recreated in main.
'n' ==>no action. Return result so program can continue. (default)
Returns
0 = table does not exist
1 = table was found, and if remove=='d', has been deleted
  • In the SQLite engine, this function considers table views to be tables.
  • If apop_opts.stop_on_warning='n', returns -1 on errors.
double apop_test ( double  statistic,
char *  distribution,
double  p1,
double  p2,
char  tail 
)

This is a convenience function to do the lookup of a given statistic along a given distribution. You give me a statistic, its (hypothesized) distribution, and whether to use the upper tail, lower tail, or both. I will return the odds of a Type I error given the model—in statistician jargon, the $p$-value. [Type I error: odds of rejecting the null hypothesis when it is true.]

For example,

1 apop_test(1.3);

will return the density of the standard Normal distribution that is more than 1.3 from zero. If this function returns a small value, we can be confident that the statistic is significant. Or,

1 apop_test(1.3, "t", 10, .tail='u');

will give the appropriate odds for an upper-tailed test using the $t$-distribution with 10 degrees of freedom (e.g., a $t$-test of the null hypothesis that the statistic is less than or equal to zero).

Several more distributions are supported; see below.

  • For a two-tailed test (the default), this returns the density outside the range. I'll only do this for symmetric distributions.
  • For an upper-tail test ('u'), this returns the density above the cutoff
  • For a lower-tail test ('l'), this returns the density below the cutoff
Parameters
statisticThe scalar value to be tested.
distributionThe name of the distribution; see below.
p1The first parameter for the distribution; see below.
p2The second parameter for the distribution; see below.
tail'u' = upper tail; 'l' = lower tail; anything else = two-tailed. (default = two-tailed)
Returns
The odds of a Type I error given the model (the $p$-value).

Here is a list of distributions you can use, and their parameters.

"normal" or "gaussian"

  • p1=mu, p2=sigma
  • default (0, 1)

"lognormal"

  • p1=mu, p2=sigma
  • default (0, 1)
  • Remember, mu and sigma refer to the Normal one would get after exponentiation
  • One-tailed tests only

"uniform"

  • p1=lower edge, p2=upper edge
  • default (0, 1)
  • two-tailed tests are run relative to the center, (p1+p2)/2.

"t"

  • p1=df
  • no default

"chi squared", "chi", "chisq":

  • p1=df
  • no default
  • One-tailed tests only; default='u' ( $p$-value for typical cases)

"f"

  • p1=df1, p2=df2
  • no default
  • One-tailed tests only
apop_data* apop_test_fisher_exact ( apop_data intab)

Run the Fisher exact test on an input contingency table.

Returns
An apop_data set with two rows:
"probability of table": Probability of the observed table for fixed marginal totals.
"p value": Table p-value. The probability of a more extreme table, where `extreme' is in a probabilistic sense.
Exceptions
out->error=='p'Processing error in the test.
  • If there are processing errors, these values will be NaN. For example:
#include <apop.h>
int main() {
/* This test is thanks to Nick Eriksson, who sent it to me in the form of a bug report. */
apop_data * testdata = apop_data_falloc((2, 3),
30, 50, 45,
34, 12, 17 );
apop_data * t2 = apop_test_fisher_exact(testdata);
assert(fabs(apop_data_get(t2,1) - 0.0001761) < 1e-6);
}
int apop_text_add ( apop_data in,
const size_t  row,
const size_t  col,
const char *  fmt,
  ... 
)

Add a string to the text element of an apop_data set. If you send me a NULL string, I will write the value of apop_opts.nan_string in the given slot. If there is already something in that slot, that string is freed, preventing memory leaks.

Parameters
inThe apop_data set, that already has an allocated text element.
rowThe row
colThe col
fmtThe text to write.
...You can use a printf-style fmt and follow it with the usual variables to fill in.
Returns
0=OK, -1=error (probably out-of-bounds)
  • UTF-8 or ASCII text is correctly handled.
  • Apophenia follows a general rule of not reallocating behind your back: if your text matrix is currently of size (3,3) and you try to put an item in slot (4,4), then I display an error rather than reallocating the text matrix.
  • Resizing a text matrix is annoying in C, so note that apop_text_alloc will reallocate to a new size if you need. For example, this code will fill the diagonals of the text array with a message, resizing as it goes:
  • The string added is a copy (via asprintf), not a pointer to the input(s).
  • If there had been a string at the grid point you are writing to, the old one is effectively lost when the new one is placed. So, I free the old string to prevent leaks if necessary. Remember this if you had other pointers aliasing that string, in which case you may as well avoid this function and just use asprintf(&(your_dataset->text[row][col]), "your string").
1 apop_data *list = (something already allocated.);
2 for (int n=0; n < 10; n++){
3  apop_text_alloc(list, n+1, n+1);
4  apop_text_add(list, n, n, "This is cell (%i, %i)", n, n);
5 }
apop_data* apop_text_alloc ( apop_data in,
const size_t  row,
const size_t  col 
)

This allocates an array of strings and puts it in the text element of an apop_data set.

If the text element already exists, then this is effectively a realloc function, reshaping to the size you specify.

Parameters
inAn apop_data set. It's OK to send in NULL, in which case an apop_data set with NULL matrix and vector elements is returned.
rowthe number of rows of text.
colthe number of columns of text.
Returns
A pointer to the relevant apop_data set. If the input was not NULL, then this is a repeat of the input pointer.
Exceptions
out->error=='a'Allocation error.
void apop_text_free ( char ***  freeme,
int  rows,
int  cols 
)

Free a matrix of chars* (i.e., a char***). This is the form of the text element of the apop_data set, so you can use this for:

1 apop_text_free(yourdata->text, yourdata->textsize[0], yourdata->textsize[1]);

This is what apop_data_free uses internally.

char* apop_text_paste ( apop_data const *  strings,
char *  between,
char *  before,
char *  after,
char *  between_cols,
apop_fn_riip  prune,
void *  prune_parameter 
)

Join together a list or array of strings, with optional separators between the strings.

Parameters
stringsAn apop_data set with a grid of text to be combined into a single string
betweenThe text to put in between the rows of the table, such as ", ". (Default is a single space: " ")
beforeThe text to put at the head of the string. For the query example, this would be .before="select ". (Default: NULL)
afterThe text to put at the tail of the string. For the query example, .after=" from data_table". (Default: NULL)
between_colsThe text to insert between columns of text. See below for an example (Default is set to equal .between)
pruneIf you don't want to use the entire text set, you can provide a function to indicate which elements should be pruned out. Some examples:
1 //Just use column 3
2 int is_not_col_3(apop_data *indata, int row, int col, void *ignore){
3  return col!=3;
4 }
5 
6 //Jump over blanks as if they don't exist.
7 int is_blank(apop_data *indata, int row, int col, void *ignore){
8  return strlen(indata->text[row][col])==0;
9 }
Parameters
prune_parameterA void pointer to pass to your prune function.
Returns
A single string with the elements of the strings table joined as per your specification. Allocated by the function, to be freed by you if desired.
  • If the table of strings is NULL or has no text, I will print only the .before and .after parts with nothing in between.
  • if apop_opts.verbose >=3, then print the pasted text to stderr.
  • This function uses the Designated initializers syntax for inputs.

The sample snippet generates the SQL for a query using a list of column names (where the query begins with select , ends with from datatab, and has commas in between each element), re-processes the same list to produce the head of an HTML table, then produces the body of the table with the query result (pasting the trs and tds into the right places).

#include <apop.h>
int main(){
apop_query("create table datatab(name, age, sex);"
"insert into datatab values ('Alex', 23, 'm');"
"insert into datatab values ('Alex', 32, 'f');"
"insert into datatab values ('Michael', 41, 'f');"
"insert into datatab values ('Michael', 14, 'm');");
apop_data *cols = apop_text_alloc(NULL, 3, 1);
apop_text_add(cols, 0, 0, "name");
apop_text_add(cols, 1, 0, "age");
apop_text_add(cols, 2, 0, "sex");
char *query= apop_text_paste(cols, .before="select ", .between=", ");
apop_data *d = apop_query_to_text("%s from datatab", query);
char *html_head = apop_text_paste(cols, .before="<table><tr><td>",
.between="</td><td>", .after="</tr>\n<tr><td>");
char *html_table = apop_text_paste(d, .before=html_head, .after="</td></tr></table>\n",
.between="</tr>\n<tr><td>", .between_cols="</td><td>");
FILE *outfile = fopen("yourdata.html", "w"); fprintf(outfile, "%s", html_table);
fclose(outfile);
}
apop_data* apop_text_unique_elements ( const apop_data d,
size_t  col 
)

Give me a column of text, and I'll give you a sorted list of the unique elements. This is basically running "select distinct * from datacolumn", but without the aid of the database.

Parameters
dAn apop_data set with a text component
colThe text column you want me to use.
Returns
An apop_data set with a single sorted column of text, where each unique text input appears once.
See also
apop_vector_unique_elements
apop_model* apop_update ( apop_data data,
apop_model prior,
apop_model likelihood,
gsl_rng *  rng 
)

Take in a prior and likelihood distribution, and output a posterior distribution.

This function first checks a table of conjugate distributions for the pair you sent in. If the names match the table, then the function returns a closed-form model with updated parameters. If the parameters aren't in the table of conjugate priors/likelihoods, then it uses Markov Chain Monte Carlo to sample from the posterior distribution, and then outputs a histogram model for further analysis. Notably, the histogram can be used as the input to this function, so you can chain Bayesian updating procedures.

  • If the prior distribution has a p or log_likelihood element, then I use apop_model_metropolis to generate the posterior.
  • If the prior does not have a p or log_likelihood but does have a draw element, then I make draws from the prior and weight them by the p given by the likelihood distribution. This is not a rejection sampling method, so the burnin is ignored.

Here are the conjugate distributions currently defined:

Prior Likelihood Notes
Beta Binomial
Beta Bernoulli
Exponential Gamma Gamma likelihood represents the distribution of $\lambda^{-1}$, not plain $\lambda$
Normal Normal Assumes prior with fixed $\sigma$; updates distribution for $\mu$
Gamma Poisson Uses sum and size of the data
  • The conjugate table is stored using a vtable; see Registering new methods in vtables for details. The typedef new functions must conform to and the hash used for lookups are:
1 typedef apop_model *(*apop_update_type)(apop_data *, apop_model , apop_model);
2 #define apop_update_hash(m1, m2) ((size_t)(m1).draw + (size_t)((m2).log_likelihood ? (m2).log_likelihood : (m2).p)*33)
Parameters
dataThe input data, that will be used by the likelihood function (default = NULL.)
priorThe prior apop_model. If the system needs to estimate the posterior via MCMC, this needs to have a log_likelihood or p method. (No default, must not be NULL.)
likelihoodThe likelihood apop_model. If the system needs to estimate the posterior via MCMC, this needs to have a log_likelihood or p method (ll preferred). (No default, must not be NULL.)
rngA gsl_rng, already initialized (e.g., via apop_rng_alloc). (default: an RNG from apop_rng_get_thread)
Returns
an apop_model struct representing the posterior, with updated parameters.

Here is a test function that compares the output via conjugate table and via Metropolis-Hastings sampling:

#include <apop.h>
//For the test suite.
void distances(gsl_vector *v1, gsl_vector *v2, double tol){
double error = apop_vector_distance(v1, v2, .metric='m');
double updated_size = apop_vector_sum(v1);
Apop_stopif(error/updated_size > tol, exit(1), 0, "The error is %g, which is too big.", error/updated_size);
}
int main(){
gsl_rng *r = apop_rng_alloc(2468);
double binom_start = 0.6;
double beta_start_a = 0.3;
double beta_start_b = 0.5;
double n = 4000;
//First, the easy estimation using the conjugate distribution table.
apop_model *beta = apop_model_set_parameters(apop_beta, beta_start_a, beta_start_b);
apop_model *bin = apop_model_set_parameters(apop_binomial, n, binom_start);
apop_model *updated = apop_update(.prior= beta, .likelihood=bin,.rng=r);
//Now estimate via MCMC.
//Requires a one-parameter binomial, with n fixed,
//and a data set of n data points with the right p.
apop_model *bcopy = apop_model_set_parameters(apop_binomial, n, GSL_NAN);
apop_data *bin_draws = apop_data_falloc((1,2), n*(1-binom_start), n*binom_start);
bin = apop_model_fix_params(bcopy);
Apop_settings_add_group(beta, apop_mcmc, .burnin=.2, .periods=1e5);
apop_model *out_h = apop_update(bin_draws, beta, bin, NULL);
apop_model *out_beta = apop_estimate(out_h->data, apop_beta);
//Finally, we can compare the conjugate and Gibbs results:
distances(updated->parameters->vector, out_beta->parameters->vector, 0.01);
//The apop_update function used apop_model_metropolis to generate
//a batch of draws. Let's use apop_model_metropolis_draw to get draws.
int i, draws = 1.3e5;
apop_data *d = apop_data_alloc(draws, 1);
for(i=0; i < draws; i ++)
apop_draw(apop_data_ptr(d, i, 0), r, out_h);
apop_model *drawn = apop_estimate(d, apop_beta);
distances(updated->parameters->vector, drawn->parameters->vector, 0.02);
}
double apop_vector_cov ( const gsl_vector *  v1,
const gsl_vector *  v2,
const gsl_vector *  weights 
)

Find the sample covariance of a pair of vectors, with an optional weighting. This only makes sense if the weightings are identical, so the function takes only one weighting vector for both.

Parameters
v1,v2The data vectors
weightsThe weight vector. Default: equal weights for all elements.
Returns
The sample covariance
double apop_vector_mean ( gsl_vector const *  v,
gsl_vector const *  weights 
)

Find the mean, weighted or unweighted.

Parameters
vThe data vector
weightsThe weight vector. Default: assume equal weights.
Returns
The weighted mean
gsl_vector* apop_vector_moving_average ( gsl_vector *  v,
size_t  bandwidth 
)

Return a new vector that is the moving average of the input vector.

Parameters
vThe input vector, unsmoothed
bandwidthThe number of elements to be smoothed.
void apop_vector_normalize ( gsl_vector *  in,
gsl_vector **  out,
const char  normalization_type 
)

This function will normalize a vector, either such that it has mean zero and variance one, or ranges between zero and one, or sums to one.

Parameters
inA gsl_vector which you have already allocated and filled. NULL input gives NULL output. (No default)
outIf normalizing in place, NULL. If not, the address of a gsl_vector. Do not allocate. (default = NULL.)
normalization_type'p': normalized vector will sum to one. E.g., start with a set of observations in bins, end with the percentage of observations in each bin. (the default)
'r': normalized vector will range between zero and one. Replace each X with (X-min) / (max - min).
's': normalized vector will have mean zero and variance one. Replace each X with $(X-\mu) / \sigma$, where $\sigma$ is the sample standard deviation.
'm': normalize to mean zero: Replace each X with $(X-\mu)$

Example

1 #include <apop.h>
2 
3 int main(void){
4 gsl_vector *in, *out;
5 
6 in = gsl_vector_calloc(3);
7 gsl_vector_set(in, 1, 1);
8 gsl_vector_set(in, 2, 2);
9 
10 printf("The original vector:\n");
11 apop_vector_show(in);
12 
13 apop_vector_normalize(in, &out, 's');
14 printf("Standardized with mean zero and variance one:\n");
15 apop_vector_show(out);
16 
17 apop_vector_normalize(in, &out, 'r');
18 printf("Normalized range with max one and min zero:\n");
19 apop_vector_show(out);
20 
21 apop_vector_normalize(in, NULL, 'p');
22 printf("Normalized into percentages:\n");
23 apop_vector_show(in);
24 }
double* apop_vector_percentiles ( gsl_vector *  data,
char  rounding 
)

Returns an array of size 101, where returned_vector[95] gives the value of the 95th percentile, for example. Returned_vector[100] is always the maximum value, and returned_vector[0] is always the min (regardless of rounding rule).

Parameters
dataa gsl_vector of data. (No default, must not be NULL.)
roundingThis will either be 'u', 'd', or 'a'. Unless your data is exactly a multiple of 101, some percentiles will be ambiguous. If 'u', then round up (use the next highest value); if 'd' (or anything else), round down to the next lowest value; if 'a', take the mean of the two nearest points. If 'u' or 'a', then you can say "5% or more of the sample is below returned_vector[5]"; if 'd' or 'a', then you can say "5% or more of the sample is above returned_vector[5]". (Default = 'd'.)
  • You may eventually want to free() the array returned by this function.
  • This function uses the Designated initializers syntax for inputs.
gsl_vector* apop_vector_realloc ( gsl_vector *  v,
size_t  newheight 
)

This function will resize a gsl_vector to a new length.

Data in the vector will be retained. If the new height is smaller than the old, then data at the end of the vector will be cropped away (in a non–memory-leaking manner). If the new height is larger than the old, then new cells will be filled with garbage; it is your responsibility to zero out or otherwise fill them before use.

Warning I: Using this function is basically bad form—especially when used in a for loop that adds an element each time. A large number of reallocs can take a noticeable amount of time. You are thus encouraged to make an effort to determine the size of your data beforehand.

Warning II: The gsl_vector is a versatile struct that can represent subvectors, matrix columns and other cuts from parent data. I can't deal with those, and check for such situations beforehand. [Besides, resizing a portion of a parent matrix makes no sense.]

Parameters
vThe already-allocated vector to resize. If you give me NULL, this is equivalent to gsl_vector_alloc
newheightThe height you'd like the vector to be.
Returns
v, now resized
apop_data* apop_vector_to_data ( gsl_vector *  v)

Deprecated; please do not use. Just use a compound literal, as in the code sample in the documentation for apop_matrix_to_data.

gsl_vector* apop_vector_unique_elements ( const gsl_vector *  v)

Give me a vector of numbers, and I'll give you a sorted list of the unique elements. This is basically running "select distinct datacol from data order by datacol", but without the aid of the database.

Parameters
va vector of items
Returns
a sorted vector of the distinct elements that appear in the input.
  • NaNs appear at the end of the sort order.
See also
apop_text_unique_elements
double apop_vector_var ( gsl_vector const *  v,
gsl_vector const *  weights 
)

Find the sample variance of a vector, weighted or unweighted.

  • This uses (n-1) in the denominator of the sum; i.e., it corrects for the bias introduced by using $\bar x$ instead of $\mu$.
  • At the moment, there is no var_pop function. Just multiply this by (n-1)/n if you need that.
Parameters
vThe data vector
weightsThe weight vector. If NULL, assume equal weights.
Returns
The weighted sample variance.
  • Apophenia tries to be smart about reading the weights. If weights sum to one, then the system uses w->size as the number of elements, and returns the usual sum over $n-1$. If weights > 1, then the system uses the total weights as $n$. Thus, you can use the weights as standard weightings or to represent elements that appear repeatedly.

Variable Documentation

apop_opts_type apop_opts

Here is where the options are initially set. See the apop_opts_type documentation for details.

Autogenerated by doxygen on Sun Oct 26 2014 (Debian 0.999b+ds3-2).