viodotcom · tokahuke · Jan 23, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,10 +14,10 @@ crate-type = ["staticlib"]
 
 [dependencies]
 ppca = { path = "./ppca" }
-pyo3 = { version = "0.18.3", features = ["extension-module"] }
-numpy = { version = "0.18.0", features = ["nalgebra"] }
+pyo3 = { version = "0.20.2", features = ["extension-module"] }
+numpy = { version = "0.20.0", features = ["nalgebra"] }
 bincode = "1.3.3"
-rayon = "1.7.0"
-nalgebra = "0.32.2"
+rayon = "1.8.1"
+nalgebra = "0.32.3"
 rand = "0.8.5"
 rand_distr = "0.4.3"
diff --git a/ppca/src/dataset.rs b/ppca/src/dataset.rs
@@ -1,5 +1,5 @@
 use bit_vec::BitVec;
-use nalgebra::DVector;
+use nalgebra::{DMatrix, DVector};
 use rayon::prelude::*;
 use serde_derive::{Deserialize, Serialize};
 use std::{ops::Index, sync::Arc};
@@ -11,6 +11,8 @@ use crate::utils::Mask;
 pub struct MaskedSample {
     pub(crate) data: DVector<f64>,
     pub(crate) mask: Mask,
+    pub(crate) prior_mean: Option<DVector<f64>>,
+    pub(crate) prior_cov: Option<DMatrix<f64>>,
 }
 
 impl MaskedSample {
@@ -24,14 +26,31 @@ impl MaskedSample {
     /// Creates a masked sample from data and a mask. The value is considered missing if its index
     /// in the masked is set to `false`.
     pub fn new(data: DVector<f64>, mask: Mask) -> MaskedSample {
-        MaskedSample { data, mask }
+        MaskedSample {
+            data,
+            mask,
+            prior_mean: None,
+            prior_cov: None,
+        }
     }
 
     /// Creates a sample without any masked values.
     pub fn unmasked(data: DVector<f64>) -> MaskedSample {
         MaskedSample {
             mask: Mask::unmasked(data.len()),
             data,
+            prior_cov: None,
+            prior_mean: None,
+        }
+    }
+
+    /// Associates a prior to this sample:
+    pub fn with_prior(self, prior_mean: DVector<f64>, prior_cov: DMatrix<f64>) -> Self {
+        MaskedSample {
+            mask: self.mask,
+            data: self.data,
+            prior_mean: Some(prior_mean),
+            prior_cov: Some(prior_cov),
         }
     }
 
@@ -96,13 +115,13 @@ pub struct Dataset {
     /// The weights associated with each sample. Use this only if you are using the PPCA as a
     /// component of a greater EM scheme (or otherwise know what you are doing). Else, let the
     /// package set it automatically to 1.
-    pub weights: Vec<f64>,
+    pub weights: Arc<Vec<f64>>,
 }
 
 impl From<Vec<MaskedSample>> for Dataset {
     fn from(value: Vec<MaskedSample>) -> Self {
         Dataset {
-            weights: vec![1.0; value.len()],
+            weights: Arc::new(vec![1.0; value.len()]),
             data: Arc::new(value),
         }
     }
@@ -152,7 +171,7 @@ impl Dataset {
     /// Creates a new dataset from a set of masked samples.
     pub fn new(data: Vec<MaskedSample>) -> Dataset {
         Dataset {
-            weights: vec![1.0; data.len()],
+            weights: Arc::new(vec![1.0; data.len()]),
             data: Arc::new(data),
         }
     }
@@ -162,19 +181,49 @@ impl Dataset {
         assert_eq!(data.len(), weights.len());
         Dataset {
             data: Arc::new(data),
-            weights,
+            weights: Arc::new(weights),
         }
     }
 
     /// Creates a new dataset with the same sample, but with different weights. This operation is
     /// cheap, since it does not clone the dataset (it's protected by an `Arc`).
-    pub fn with_weights(&self, weights: Vec<f64>) -> Dataset {
+    pub fn with_weights(self, weights: Vec<f64>) -> Dataset {
         Dataset {
-            data: self.data.clone(),
-            weights,
+            data: self.data,
+            weights: Arc::new(weights),
         }
     }
 
+    pub fn with_priors(
+        mut self,
+        prior_means: Vec<DVector<f64>>,
+        prior_covs: Vec<DMatrix<f64>>,
+    ) -> Dataset {
+        assert_eq!(
+            self.data.len(),
+            prior_means.len(),
+            "Prior mean length mismatch"
+        );
+        assert_eq!(
+            self.data.len(),
+            prior_covs.len(),
+            "Prior cov length mismatch"
+        );
+
+        let mut new_data = vec![];
+        self.data
+            .par_iter()
+            .zip(prior_means)
+            .zip(prior_covs)
+            .map(|((sample, prior_mean), prior_cov)| {
+                sample.clone().with_prior(prior_mean, prior_cov)
+            })
+            .collect_into_vec(&mut new_data);
+
+        self.data = Arc::new(new_data);
+        self
+    }
+
     /// The length of this dataset.
     pub fn len(&self) -> usize {
         self.data.len()
@@ -193,7 +242,7 @@ impl Dataset {
     /// Lists the dimensions which as masked in __all__ samples in this dataset.
     pub fn empty_dimensions(&self) -> Vec<usize> {
         let Some(n_dimensions) = self.data.first().map(|sample| sample.mask().0.len()) else {
-            return vec![]
+            return vec![];
         };
         let new_mask = || BitVec::from_elem(n_dimensions, false);
         let poormans_or = |mut this: BitVec, other: &BitVec| {

diff --git a/ppca/src/mix.rs b/ppca/src/mix.rs
@@ -168,7 +168,7 @@ impl PPCAMix {
         dataset
             .data
             .par_iter()
-            .zip(&dataset.weights)
+            .zip(&*dataset.weights)
             .map(|(sample, &weight)| weight * self.llk_one(sample))
             .sum::<f64>()
     }
@@ -291,7 +291,7 @@ impl PPCAMix {
                 // Log-posteriors for this particular model.
                 let log_posteriors: Vec<_> = log_posteriors
                     .par_iter()
-                    .zip(&dataset.weights)
+                    .zip(&*dataset.weights)
                     .filter(|&(_, &wi)| wi > 0.0)
                     .map(|(lp, &wi)| wi.ln() + lp[i])
                     .collect();
@@ -311,7 +311,7 @@ impl PPCAMix {
                     .collect();
                 let logsum_posteriors =
                     unnorm_posteriors.iter().copied().sum::<f64>().ln() + max_posterior;
-                let dataset = dataset.with_weights(unnorm_posteriors);
+                let dataset = dataset.clone().with_weights(unnorm_posteriors);
 
                 (model.iterate_with_prior(&dataset, prior), logsum_posteriors)
             })

diff --git a/ppca/src/output_covariance.rs b/ppca/src/output_covariance.rs
@@ -130,6 +130,13 @@ impl<'a> OutputCovariance<'a> {
         }
     }
 
+    pub(crate) fn transform_state(&self, transform: &DMatrix<f64>) -> OutputCovariance<'static> {
+        OutputCovariance {
+            isotropic_noise: self.isotropic_noise,
+            transform: Cow::Owned(&*self.transform * transform),
+        }
+    }
+
     pub(crate) fn quadratic_form(&self, x: &DVector<f64>) -> f64 {
         let norm_squared = x.norm_squared();
         let transpose_transformed = self.transform.transpose() * x;

diff --git a/ppca/src/ppca_model.rs b/ppca/src/ppca_model.rs
@@ -143,7 +143,7 @@ impl PPCAModel {
         dataset
             .data
             .par_iter()
-            .zip(&dataset.weights)
+            .zip(&*dataset.weights)
             .map(|(sample, weight)| self.llk_one(sample) * weight)
             .sum()
     }
@@ -174,10 +174,7 @@ impl PPCAModel {
                 .collect::<BitVec>(),
         );
 
-        MaskedSample {
-            data: mask.fillna(&(sampled_state + noise)),
-            mask,
-        }
+        MaskedSample::new(mask.fillna(&(sampled_state + noise)), mask)
     }
 
     /// Sample a full dataset from the PPCA model and masks each entry according to a
@@ -197,13 +194,39 @@ impl PPCAModel {
             return self.uninferred();
         }
 
-        let sub_sample = sample.mask.mask(&(sample.data_vector() - &self.0.mean));
-        let sub_covariance = self.0.output_covariance.masked(&sample.mask);
+        let mean = if let Some(prior_mean) = &sample.prior_mean {
+            &self.0.mean + &*self.0.output_covariance.transform * prior_mean
+        } else {
+            self.0.mean.clone()
+        };
 
-        InferredMasked {
-            model: self.clone(),
-            state: sub_covariance.estimator_transform() * sub_sample,
-            covariance: sub_covariance.estimator_covariance(),
+        let sub_sample = sample.mask.mask(&(sample.data_vector() - mean));
+
+        if let Some(prior_cov) = &sample.prior_cov {
+            let chol = prior_cov
+                .clone()
+                .cholesky()
+                .expect("Prior should be Cholesky-decomposable");
+            let l = chol.l();
+
+            let sub_covariance = self
+                .0
+                .output_covariance
+                .masked(&sample.mask)
+                .transform_state(&l);
+
+            InferredMasked {
+                model: self.clone(),
+                state: &l * sub_covariance.estimator_transform() * sub_sample,
+                covariance: &l * sub_covariance.estimator_covariance() * l.transpose(),
+            }
+        } else {
+            let sub_covariance = self.0.output_covariance.masked(&sample.mask);
+            InferredMasked {
+                model: self.clone(),
+                state: sub_covariance.estimator_transform() * sub_sample,
+                covariance: sub_covariance.estimator_covariance(),
+            }
         }
     }
 
@@ -229,7 +252,7 @@ impl PPCAModel {
         dataset
             .data
             .par_iter()
-            .zip(&dataset.weights)
+            .zip(&*dataset.weights)
             .map(|(sample, &weight)| (self.smooth_one(sample), weight))
             .collect()
     }
@@ -246,7 +269,7 @@ impl PPCAModel {
         dataset
             .data
             .par_iter()
-            .zip(&dataset.weights)
+            .zip(&*dataset.weights)
             .map(|(sample, &weight)| (self.extrapolate_one(sample), weight))
             .collect()
     }
@@ -272,7 +295,7 @@ impl PPCAModel {
         let total_cross_moment = dataset
             .data
             .par_iter()
-            .zip(&dataset.weights)
+            .zip(&*dataset.weights)
             .zip(&inferred)
             .map(|((sample, &weight), inferred)| {
                 let centered_filled = sample.mask.fillna(&(sample.data_vector() - &self.0.mean));
@@ -288,7 +311,7 @@ impl PPCAModel {
                 let total_second_moment = dataset
                     .data
                     .iter()
-                    .zip(&dataset.weights)
+                    .zip(&*dataset.weights)
                     .zip(&inferred)
                     .filter(|((sample, _), _)| sample.mask.0[idx])
                     .map(|((_, &weight), inferred)| weight * inferred.second_moment())
@@ -319,7 +342,7 @@ impl PPCAModel {
         let (square_error, deviations_square_sum, total_deviation, totals) = dataset
             .data
             .par_iter()
-            .zip(&dataset.weights)
+            .zip(&*dataset.weights)
             .zip(&inferred)
             .filter(|((sample, _), _)| !sample.is_empty())
             .map(
@@ -664,9 +687,9 @@ mod test {
     #[test]
     fn test_llk() {
         let model = toy_model();
-        dbg!(model.llk(&Dataset::new(vec![MaskedSample {
-            data: dvector![1.0, 2.0, 3.0],
-            mask: Mask(BitVec::from_elem(3, true)),
-        }])));
+        dbg!(model.llk(&Dataset::new(vec![MaskedSample::new(
+            dvector![1.0, 2.0, 3.0],
+            Mask(BitVec::from_elem(3, true))
+        )])));
     }
 }