diff --git a/.github/workflows/lha_bot.yml b/.github/workflows/lha_bot.yml index 690548ef1..92df8153c 100644 --- a/.github/workflows/lha_bot.yml +++ b/.github/workflows/lha_bot.yml @@ -4,7 +4,7 @@ name: LHA Benchmarks on: push: branches-ignore: - - "*" + - "**" tags: pull_request: types: diff --git a/.github/workflows/lha_bot_rust.yml b/.github/workflows/lha_bot_rust.yml index bdc330d9a..b4ba3aba2 100644 --- a/.github/workflows/lha_bot_rust.yml +++ b/.github/workflows/lha_bot_rust.yml @@ -4,7 +4,7 @@ name: LHA Benchmarks (Rust) on: push: branches-ignore: - - "*" + - "**" tags: pull_request: types: diff --git a/crates/eko/src/lib.rs b/crates/eko/src/lib.rs index 242f404b5..531a88674 100644 --- a/crates/eko/src/lib.rs +++ b/crates/eko/src/lib.rs @@ -1,5 +1,6 @@ //! Interface to the eko Python package. +use ekore::constants::{MAX_ORDER_QCD, MAX_ORDER_QED}; use ekore::harmonics::cache::Cache; use num::Complex; use std::ffi::c_void; @@ -7,71 +8,72 @@ use std::ffi::c_void; pub mod bib; pub mod mellin; -/// Wrapper to pass arguments back to Python. -struct RawCmplx { - re: Vec, - im: Vec, -} - -/// Map tensor with shape (o,d,d) to c-ordered list. +/// Map tensor with shape `(order_QCD, d, d)` into pre-allocated output buffers. /// -/// This is needed for the QCD singlet. -fn unravel(res: Vec<[[Complex; DIM]; DIM]>, order_qcd: usize) -> RawCmplx { - let mut target = RawCmplx { - re: Vec::::new(), - im: Vec::::new(), - }; +/// Only the first `order_qcd` entries along the outer axis are written; +/// remaining buffer slots are left undefined. +fn unravel_qcd( + res: &[[[Complex; DIM]; DIM]; MAX_ORDER_QCD], + order_qcd: usize, + out_re: &mut [f64], + out_im: &mut [f64], +) { + let mut i = 0; for obj in res.iter().take(order_qcd) { - for col in obj.iter().take(DIM) { - for el in col.iter().take(DIM) { - target.re.push(el.re); - target.im.push(el.im); + for col in obj.iter() { + for el in col.iter() { + out_re[i] = el.re; + out_im[i] = el.im; + i += 1; } } } - target } -/// Map tensor with shape (o,o',d,d) to c-ordered list. +/// Map tensor with shape `(order_QCD+1, order_QED+1, d, d)` into pre-allocated output buffers. /// -/// This is needed for the QED singlet and valence. +/// The first `order_qcd + 1` entries along the QCD axis and `order_qed + 1` along the QED axis +/// are written; remaining buffer slots are left undefined. fn unravel_qed( - res: Vec; DIM]; DIM]>>, + res: &[[[[Complex; DIM]; DIM]; MAX_ORDER_QED + 1]; MAX_ORDER_QCD + 1], order_qcd: usize, order_qed: usize, -) -> RawCmplx { - let mut target = RawCmplx { - re: Vec::::new(), - im: Vec::::new(), - }; + out_re: &mut [f64], + out_im: &mut [f64], +) { + let mut i = 0; for obj_ in res.iter().take(order_qcd + 1) { for obj in obj_.iter().take(order_qed + 1) { - for col in obj.iter().take(DIM) { - for el in col.iter().take(DIM) { - target.re.push(el.re); - target.im.push(el.im); + for col in obj.iter() { + for el in col.iter() { + out_re[i] = el.re; + out_im[i] = el.im; + i += 1; } } } } - target } -/// Map tensor with shape (o,o',d) to c-ordered list. +/// Map tensor with shape `(order_QCD+1, order_QED+1)` into pre-allocated output buffers. /// -/// This is needed for the QED non-singlet. -fn unravel_qed_ns(res: Vec>>, order_qcd: usize, order_qed: usize) -> RawCmplx { - let mut target = RawCmplx { - re: Vec::::new(), - im: Vec::::new(), - }; +/// The first `order_qcd + 1` entries along the QCD axis and `order_qed + 1` along the QED axis +/// are written; remaining buffer slots are left undefined. +fn unravel_qed_ns( + res: &[[Complex; MAX_ORDER_QED + 1]; MAX_ORDER_QCD + 1], + order_qcd: usize, + order_qed: usize, + out_re: &mut [f64], + out_im: &mut [f64], +) { + let mut i = 0; for col in res.iter().take(order_qcd + 1) { for el in col.iter().take(order_qed + 1) { - target.re.push(el.re); - target.im.push(el.im); + out_re[i] = el.re; + out_im[i] = el.im; + i += 1; } } - target } /// Integration kernel inside quad. @@ -93,42 +95,46 @@ pub unsafe extern "C" fn rust_quad_ker(u: f64, rargs: *mut c_void) -> f64 { let path = mellin::TalbotPath::new(u, args.logx, is_singlet); let jac = path.jac() * path.prefactor(); let mut c = Cache::new(path.n()); - let mut raw = RawCmplx { - re: Vec::::new(), - im: Vec::::new(), - }; - let n3lo_ad_variation = std::slice::from_raw_parts(args.n3lo_ad_variation, 7) + let n3lo_ad_variation: [u8; 7] = std::slice::from_raw_parts(args.n3lo_ad_variation, 7) .try_into() .unwrap(); + let max_buffer_size = (args.order_qcd + 1) * (args.order_qed + 1) * 16; + let out_re = std::slice::from_raw_parts_mut(args.re_gamma, max_buffer_size); + let out_im = std::slice::from_raw_parts_mut(args.im_gamma, max_buffer_size); + if args.is_ome { if is_singlet { - raw = unravel( - ekore::operator_matrix_elements::unpolarized::spacelike::A_singlet( + unravel_qcd( + &ekore::operator_matrix_elements::unpolarized::spacelike::A_singlet( args.order_qcd, &mut c, args.nf, args.L, ), args.order_qcd, + out_re, + out_im, ); } else { - raw = unravel( - ekore::operator_matrix_elements::unpolarized::spacelike::A_non_singlet( + unravel_qcd( + &ekore::operator_matrix_elements::unpolarized::spacelike::A_non_singlet( args.order_qcd, &mut c, args.nf, args.L, ), args.order_qcd, + out_re, + out_im, ); } } else if is_singlet { if args.order_qed > 0 { let gamma_singlet_qed = ekore::anomalous_dimensions::unpolarized::spacelike::gamma_singlet_qed; - raw = unravel_qed( - gamma_singlet_qed( + unravel_qed( + &gamma_singlet_qed( args.order_qcd, args.order_qed, &mut c, @@ -137,28 +143,32 @@ pub unsafe extern "C" fn rust_quad_ker(u: f64, rargs: *mut c_void) -> f64 { ), args.order_qcd, args.order_qed, + out_re, + out_im, ); } else { let gamma_singlet_qcd = match args.is_polarized { true => ekore::anomalous_dimensions::polarized::spacelike::gamma_singlet_qcd, false => ekore::anomalous_dimensions::unpolarized::spacelike::gamma_singlet_qcd, }; - raw = unravel( - gamma_singlet_qcd( + unravel_qcd( + &gamma_singlet_qcd( args.order_qcd, &mut c, args.nf, n3lo_ad_variation[0..4].try_into().unwrap(), ), args.order_qcd, + out_re, + out_im, ); } } else if args.order_qed > 0 { if is_qed_valence { let gamma_valence_qed = ekore::anomalous_dimensions::unpolarized::spacelike::gamma_valence_qed; - raw = unravel_qed( - gamma_valence_qed( + unravel_qed( + &gamma_valence_qed( args.order_qcd, args.order_qed, &mut c, @@ -167,11 +177,13 @@ pub unsafe extern "C" fn rust_quad_ker(u: f64, rargs: *mut c_void) -> f64 { ), args.order_qcd, args.order_qed, + out_re, + out_im, ); } else { let gamma_ns_qed = ekore::anomalous_dimensions::unpolarized::spacelike::gamma_ns_qed; - raw = unravel_qed_ns( - gamma_ns_qed( + unravel_qed_ns( + &gamma_ns_qed( args.order_qcd, args.order_qed, args.mode0, @@ -181,10 +193,11 @@ pub unsafe extern "C" fn rust_quad_ker(u: f64, rargs: *mut c_void) -> f64 { ), args.order_qcd, args.order_qed, + out_re, + out_im, ); } } else { - // we can not do 1D let gamma_ns_qcd = match args.is_polarized { true => ekore::anomalous_dimensions::polarized::spacelike::gamma_ns_qcd, false => ekore::anomalous_dimensions::unpolarized::spacelike::gamma_ns_qcd, @@ -196,16 +209,16 @@ pub unsafe extern "C" fn rust_quad_ker(u: f64, rargs: *mut c_void) -> f64 { args.nf, n3lo_ad_variation[4..7].try_into().unwrap(), ); - for el in res.iter().take(args.order_qcd) { - raw.re.push(el.re); - raw.im.push(el.im); + for (i, el) in res.iter().take(args.order_qcd).enumerate() { + out_re[i] = el.re; + out_im[i] = el.im; } } // pass on (args.py)( - raw.re.as_ptr(), - raw.im.as_ptr(), + args.re_gamma as *const f64, + args.im_gamma as *const f64, c.n().re, c.n().im, jac.re, @@ -317,6 +330,9 @@ pub struct QuadArgs { pub alphaem_running: bool, // additional param required for N3LO pub n3lo_ad_variation: *const u8, + // pre-allocated gamma output buffers (owned by Python, written by Rust) + pub re_gamma: *mut f64, + pub im_gamma: *mut f64, } /// Empty placeholder function for python callback. @@ -403,5 +419,7 @@ pub unsafe extern "C" fn empty_args() -> QuadArgs { a_half_y: 0, alphaem_running: false, n3lo_ad_variation: [0, 0, 0, 0, 0, 0, 0].as_ptr(), + re_gamma: std::ptr::null_mut(), + im_gamma: std::ptr::null_mut(), } } diff --git a/crates/ekore/src/anomalous_dimensions/polarized/spacelike.rs b/crates/ekore/src/anomalous_dimensions/polarized/spacelike.rs index dd636b0ad..178cb7eb9 100644 --- a/crates/ekore/src/anomalous_dimensions/polarized/spacelike.rs +++ b/crates/ekore/src/anomalous_dimensions/polarized/spacelike.rs @@ -1,6 +1,6 @@ //! The polarized, space-like anomalous dimensions at various couplings power. -use crate::constants::{PID_NSM, PID_NSP, PID_NSV}; +use crate::constants::{MAX_ORDER_QCD, PID_NSM, PID_NSP, PID_NSV}; use crate::harmonics::cache::Cache; use num::complex::Complex; use num::Zero; @@ -9,14 +9,17 @@ pub mod as2; // pub mod as3; /// Compute the tower of the non-singlet anomalous dimensions. +/// +/// Returns an array of shape `(MAX_ORDER_QCD,)`. Only the first `order_qcd` entries +/// are filled; remaining slots are zero. pub fn gamma_ns_qcd( order_qcd: usize, mode: u16, c: &mut Cache, nf: u8, _n3lo_variation: [u8; 3], -) -> Vec> { - let mut gamma_ns = vec![Complex::::zero(); order_qcd]; +) -> [Complex; MAX_ORDER_QCD] { + let mut gamma_ns = [Complex::::zero(); MAX_ORDER_QCD]; gamma_ns[0] = as1::gamma_ns(c, nf); // NLO and beyond if order_qcd >= 2 { @@ -42,19 +45,16 @@ pub fn gamma_ns_qcd( } /// Compute the tower of the singlet anomalous dimension matrices. +/// +/// Returns an array of shape `(MAX_ORDER_QCD, d, d)`. Only the first `order_qcd` +/// entries along the outer axis are filled; remaining slots are zero. pub fn gamma_singlet_qcd( order_qcd: usize, c: &mut Cache, nf: u8, _n3lo_variation: [u8; 4], -) -> Vec<[[Complex; 2]; 2]> { - let mut gamma_S = vec![ - [ - [Complex::::zero(), Complex::::zero()], - [Complex::::zero(), Complex::::zero()] - ]; - order_qcd - ]; +) -> [[[Complex; 2]; 2]; MAX_ORDER_QCD] { + let mut gamma_S = [[[Complex::::zero(); 2]; 2]; MAX_ORDER_QCD]; gamma_S[0] = as1::gamma_singlet(c, nf); // NLO and beyond if order_qcd >= 2 { diff --git a/crates/ekore/src/anomalous_dimensions/unpolarized/spacelike.rs b/crates/ekore/src/anomalous_dimensions/unpolarized/spacelike.rs index 4589b9278..b8984a25d 100644 --- a/crates/ekore/src/anomalous_dimensions/unpolarized/spacelike.rs +++ b/crates/ekore/src/anomalous_dimensions/unpolarized/spacelike.rs @@ -1,7 +1,8 @@ //! The unpolarized, space-like anomalous dimensions at various couplings power. use crate::constants::{ - ED2, EU2, PID_NSM, PID_NSM_D, PID_NSM_U, PID_NSP, PID_NSP_D, PID_NSP_U, PID_NSV, + ED2, EU2, MAX_ORDER_QCD, MAX_ORDER_QED, PID_NSM, PID_NSM_D, PID_NSM_U, PID_NSP, PID_NSP_D, + PID_NSP_U, PID_NSV, }; use crate::harmonics::cache::Cache; use num::complex::Complex; @@ -19,14 +20,17 @@ pub mod as4; /// `n3lo_variation = (ns_p, ns_m, ns_v)` is a list indicating which variation should /// be used. `variation = 1,2` is the upper/lower bound, while any other value /// returns the central (averaged) value. +/// +/// Returns an array of shape `(MAX_ORDER_QCD,)`. Only the first `order_qcd` entries +/// are filled; remaining slots are zero. pub fn gamma_ns_qcd( order_qcd: usize, mode: u16, c: &mut Cache, nf: u8, n3lo_variation: [u8; 3], -) -> Vec> { - let mut gamma_ns = vec![Complex::::zero(); order_qcd]; +) -> [Complex; MAX_ORDER_QCD] { + let mut gamma_ns = [Complex::::zero(); MAX_ORDER_QCD]; gamma_ns[0] = as1::gamma_ns(c, nf); // NLO and beyond if order_qcd >= 2 { @@ -66,19 +70,16 @@ pub fn gamma_ns_qcd( /// `n3lo_variation = (gg, gq, qg, qq)` is a list indicating which variation should /// be used. `variation = 1,2` is the upper/lower bound, while any other value /// returns the central (averaged) value. +/// +/// Returns an array of shape `(MAX_ORDER_QCD, d, d)`. Only the first `order_qcd` +/// entries along the outer axis are filled; remaining slots are zero. pub fn gamma_singlet_qcd( order_qcd: usize, c: &mut Cache, nf: u8, n3lo_variation: [u8; 4], -) -> Vec<[[Complex; 2]; 2]> { - let mut gamma_S = vec![ - [ - [Complex::::zero(), Complex::::zero()], - [Complex::::zero(), Complex::::zero()] - ]; - order_qcd - ]; +) -> [[[Complex; 2]; 2]; MAX_ORDER_QCD] { + let mut gamma_S = [[[Complex::::zero(); 2]; 2]; MAX_ORDER_QCD]; gamma_S[0] = as1::gamma_singlet(c, nf); // NLO and beyond if order_qcd >= 2 { @@ -100,6 +101,10 @@ pub fn gamma_singlet_qcd( /// `n3lo_variation = (ns_p, ns_m, ns_v)` is a list indicating which variation should /// be used. `variation = 1,2` is the upper/lower bound, while any other value /// returns the central (averaged) value. +/// +/// Returns an array of shape `(MAX_ORDER_QCD+1, MAX_ORDER_QED+1)`. The first +/// `order_qcd + 1` entries along the QCD axis and `order_qed + 1` along the QED axis +/// are filled; remaining slots are zero. pub fn gamma_ns_qed( order_qcd: usize, order_qed: usize, @@ -107,9 +112,8 @@ pub fn gamma_ns_qed( c: &mut Cache, nf: u8, n3lo_variation: [u8; 3], -) -> Vec>> { - let col = vec![Complex::::zero(); order_qed + 1]; - let mut gamma_ns = vec![col; order_qcd + 1]; +) -> [[Complex; MAX_ORDER_QED + 1]; MAX_ORDER_QCD + 1] { + let mut gamma_ns = [[Complex::::zero(); MAX_ORDER_QED + 1]; MAX_ORDER_QCD + 1]; // QCD corrections let qcd_mode = match mode { @@ -152,23 +156,18 @@ pub fn gamma_ns_qed( /// `n3lo_variation = (gg, gq, qg, qq, ns_p, ns_m, ns_v)` is a list indicating which variation should /// be used. `variation = 1,2` is the upper/lower bound, while any other value /// returns the central (averaged) value. +/// +/// Returns an array of shape `(MAX_ORDER_QCD+1, MAX_ORDER_QED+1, d, d)`. The first +/// `order_qcd + 1` entries along the QCD axis and `order_qed + 1` along the QED axis +/// are filled; remaining slots are zero. pub fn gamma_singlet_qed( order_qcd: usize, order_qed: usize, c: &mut Cache, nf: u8, n3lo_variation: [u8; 7], -) -> Vec; 4]; 4]>> { - let col = vec![ - [[ - Complex::::zero(), - Complex::::zero(), - Complex::::zero(), - Complex::::zero() - ]; 4]; - order_qed + 1 - ]; - let mut gamma_s = vec![col; order_qcd + 1]; +) -> [[[[Complex; 4]; 4]; MAX_ORDER_QED + 1]; MAX_ORDER_QCD + 1] { + let mut gamma_s = [[[[Complex::::zero(); 4]; 4]; MAX_ORDER_QED + 1]; MAX_ORDER_QCD + 1]; // QCD corrections let gamma_qcd_s = gamma_singlet_qcd(order_qcd, c, nf, n3lo_variation[0..4].try_into().unwrap()); @@ -223,15 +222,18 @@ pub fn gamma_singlet_qed( /// `n3lo_variation = (ns_p, ns_m, ns_v)` is a list indicating which variation should /// be used. `variation = 1,2` is the upper/lower bound, while any other value /// returns the central (averaged) value. +/// +/// Returns an array of shape `(MAX_ORDER_QCD+1, MAX_ORDER_QED+1, d, d)`. The first +/// `order_qcd + 1` entries along the QCD axis and `order_qed + 1` along the QED axis +/// are filled; remaining slots are zero. pub fn gamma_valence_qed( order_qcd: usize, order_qed: usize, c: &mut Cache, nf: u8, n3lo_variation: [u8; 3], -) -> Vec; 2]; 2]>> { - let col = vec![[[Complex::::zero(), Complex::::zero(),]; 2]; order_qed + 1]; - let mut gamma_v = vec![col; order_qcd + 1]; +) -> [[[[Complex; 2]; 2]; MAX_ORDER_QED + 1]; MAX_ORDER_QCD + 1] { + let mut gamma_v = [[[[Complex::::zero(); 2]; 2]; MAX_ORDER_QED + 1]; MAX_ORDER_QCD + 1]; // QCD corrections let gamma_qcd_nsv = gamma_ns_qcd(order_qcd, PID_NSV, c, nf, n3lo_variation); diff --git a/crates/ekore/src/constants.rs b/crates/ekore/src/constants.rs index a66cf71be..a8e1ad923 100644 --- a/crates/ekore/src/constants.rs +++ b/crates/ekore/src/constants.rs @@ -67,6 +67,12 @@ pub const PID_NSM_U: u16 = 10202; /// valence-like non-singlet down-sector |PID|. pub const PID_NSM_D: u16 = 10203; +/// Maximum QCD coupling power implemented. +pub const MAX_ORDER_QCD: usize = 4; + +/// Maximum QED coupling power implemented. +pub const MAX_ORDER_QED: usize = 2; + /// |QED| electric charge combinations. pub struct ChargeCombinations { pub nf: u8, diff --git a/crates/ekore/src/harmonics/cache.rs b/crates/ekore/src/harmonics/cache.rs index 595834017..3c1b2049f 100644 --- a/crates/ekore/src/harmonics/cache.rs +++ b/crates/ekore/src/harmonics/cache.rs @@ -1,12 +1,11 @@ //! Cache harmonic sums for given Mellin N. use num::{complex::Complex, Zero}; -use std::collections::HashMap; use crate::harmonics::{g_functions, w1, w2, w3, w4, w5}; /// List of available elements. -#[derive(Debug, PartialEq, Eq, Hash)] +#[derive(Debug, PartialEq, Eq)] pub enum K { /// $S_1(N)$ S1, @@ -50,12 +49,41 @@ pub enum K { Sm21o, } +impl K { + fn idx(&self) -> usize { + match self { + K::S1 => 0, + K::S2 => 1, + K::S3 => 2, + K::S4 => 3, + K::S5 => 4, + K::S1h => 5, + K::S2h => 6, + K::S3h => 7, + K::S1mh => 8, + K::S2mh => 9, + K::S3mh => 10, + K::G3 => 11, + K::Sm1e => 12, + K::Sm1o => 13, + K::Sm2e => 14, + K::Sm2o => 15, + K::Sm3e => 16, + K::Sm3o => 17, + K::Sm21e => 18, + K::Sm21o => 19, + } + } +} + +const CACHE_SIZE: usize = 20; + /// Hold all cached values. pub struct Cache { /// Mellin N n: Complex, - /// Mapping - m: HashMap>, + /// Flat lookup table indexed by K::idx() + m: [Option>; CACHE_SIZE], } impl Cache { @@ -63,7 +91,7 @@ impl Cache { pub fn new(n: Complex) -> Self { Self { n, - m: HashMap::new(), + m: [None; CACHE_SIZE], } } @@ -74,10 +102,9 @@ impl Cache { /// Retrieve an element. pub fn get(&mut self, k: K) -> Complex { - let val = self.m.get(&k); - // already there? - if let Some(value) = val { - return *value; + let idx = k.idx(); + if let Some(val) = self.m[idx] { + return val; } // compute new let val = match k { @@ -103,7 +130,7 @@ impl Cache { K::Sm21o => w3::Sm21o(self), }; // insert - self.m.insert(k, val); + self.m[idx] = Some(val); val } } @@ -141,6 +168,47 @@ mod tests { assert_approx_eq_cmplx!(f64, m, cmplx!(2., 0.)); } + #[test] + fn test_cache_idx_mapping() { + let all_variants = [ + K::S1, + K::S2, + K::S3, + K::S4, + K::S5, + K::S1h, + K::S2h, + K::S3h, + K::S1mh, + K::S2mh, + K::S3mh, + K::G3, + K::Sm1e, + K::Sm1o, + K::Sm2e, + K::Sm2o, + K::Sm3e, + K::Sm3o, + K::Sm21e, + K::Sm21o, + ]; + // size: number of variants matches CACHE_SIZE + assert_eq!(all_variants.len(), CACHE_SIZE); + + let mut c = Cache::new(cmplx!(2., 0.)); + let mut seen = [false; CACHE_SIZE]; + for v in all_variants { + let idx = v.idx(); + // mapping is continuous: no duplicate indices + assert!(!seen[idx], "duplicate index {idx}"); + seen[idx] = true; + // exercises Cache::get(), panics if idx() >= CACHE_SIZE (size check) + c.get(v); + } + // every slot 0..CACHE_SIZE was covered (no holes) + assert!(seen.iter().all(|&b| b), "index mapping has holes"); + } + #[test] fn test_recursive_harmonic_sum() { const SX: [fn(Complex) -> Complex; 5] = [w1::S1, w2::S2, w3::S3, w4::S4, w5::S5]; diff --git a/crates/ekore/src/lib.rs b/crates/ekore/src/lib.rs index db937583d..6505e6957 100644 --- a/crates/ekore/src/lib.rs +++ b/crates/ekore/src/lib.rs @@ -5,7 +5,7 @@ pub mod anomalous_dimensions; pub mod bib; -mod constants; +pub mod constants; pub mod harmonics; pub mod operator_matrix_elements; pub mod util; diff --git a/crates/ekore/src/operator_matrix_elements/unpolarized/spacelike.rs b/crates/ekore/src/operator_matrix_elements/unpolarized/spacelike.rs index 3c57d33ef..fa87d0441 100644 --- a/crates/ekore/src/operator_matrix_elements/unpolarized/spacelike.rs +++ b/crates/ekore/src/operator_matrix_elements/unpolarized/spacelike.rs @@ -1,4 +1,5 @@ //! The unpolarized, space-like |OME| at various couplings power. +use crate::constants::MAX_ORDER_QCD; use crate::harmonics::cache::Cache; use num::complex::Complex; use num::Zero; @@ -6,32 +7,16 @@ pub mod as1; pub mod as2; /// Compute the tower of the singlet |OME|. +/// +/// Returns an array of shape `(MAX_ORDER_QCD, d, d)`. Only the first `matching_order_qcd` +/// entries along the outer axis are filled; remaining slots are zero. pub fn A_singlet( matching_order_qcd: usize, c: &mut Cache, nf: u8, L: f64, -) -> Vec<[[Complex; 3]; 3]> { - let mut A_s = vec![ - [ - [ - Complex::::zero(), - Complex::::zero(), - Complex::::zero() - ], - [ - Complex::::zero(), - Complex::::zero(), - Complex::::zero() - ], - [ - Complex::::zero(), - Complex::::zero(), - Complex::::zero() - ] - ]; - matching_order_qcd - ]; +) -> [[[Complex; 3]; 3]; MAX_ORDER_QCD] { + let mut A_s = [[[Complex::::zero(); 3]; 3]; MAX_ORDER_QCD]; if matching_order_qcd >= 1 { A_s[0] = as1::A_singlet(c, nf, L); } @@ -43,19 +28,16 @@ pub fn A_singlet( } /// Compute the tower of the non-singlet |OME|. +/// +/// Returns an array of shape `(MAX_ORDER_QCD, d, d)`. Only the first `matching_order_qcd` +/// entries along the outer axis are filled; remaining slots are zero. pub fn A_non_singlet( matching_order_qcd: usize, c: &mut Cache, nf: u8, L: f64, -) -> Vec<[[Complex; 2]; 2]> { - let mut A_ns = vec![ - [ - [Complex::::zero(), Complex::::zero()], - [Complex::::zero(), Complex::::zero()] - ]; - matching_order_qcd - ]; +) -> [[[Complex; 2]; 2]; MAX_ORDER_QCD] { + let mut A_ns = [[[Complex::::zero(); 2]; 2]; MAX_ORDER_QCD]; if matching_order_qcd >= 1 { A_ns[0] = as1::A_ns(c, nf, L); } diff --git a/extras/gsoc/performance.md b/extras/gsoc/performance.md index f07d3af9c..8b0e4b8e7 100644 --- a/extras/gsoc/performance.md +++ b/extras/gsoc/performance.md @@ -158,3 +158,76 @@ This adds a fixed overhead of several milliseconds on every `integrate.quad` nod ### Decision The `nb → rs → nb` architecture introduced in [526](https://github.com/NNPDF/eko/pull/526) was not viable, hence the PR was closed. The preferred short-term path remains `scipy → Rust → Numba → Rust → scipy`. See [architecture.md §5](./architecture.md#5-scipyintegratequad) for the full discussion. + +--- + +## 542: Remove mallocs, single shared buffer + +After [542](https://github.com/NNPDF/eko/pull/542), all mallocs were removed from `eko` and `ekore`, the gamma output is now written directly into a single Python-owned `f64` buffer that is pre-allocated once and passed through `QuadArgs`, minor python loops were removed. + +### Full-suite timing (`/usr/bin/time -v`) + +```text +====================== 3 passed, 12 deselected, 1 warning in 219.57s (0:03:39) ====================== + Command being timed: "poe lha -m nnlo and sv" + User time (seconds): 342.81 + System time (seconds): 2.75 + Percent of CPU this job got: 101% + Elapsed (wall clock) time (h:mm:ss or m:ss): 5:40.23 + Average shared text size (kbytes): 0 + Average unshared data size (kbytes): 0 + Average stack size (kbytes): 0 + Average total size (kbytes): 0 + Maximum resident set size (kbytes): 931916 + Average resident set size (kbytes): 0 + Major (requiring I/O) page faults: 1681 + Minor (reclaiming a frame) page faults: 679694 + Voluntary context switches: 3758 + Involuntary context switches: 3360 + Swaps: 0 + File system inputs: 366568 + File system outputs: 179328 + Socket messages sent: 0 + Socket messages received: 0 + Signals delivered: 0 + Page size (bytes): 4096 + Exit status: 0 +``` + +### Comparison with master + +| | Wall clock | pytest time | +| --- | ---------- | ----------- | +| [master](https://github.com/NNPDF/eko/tree/5aebe0abdea3e75ed7761b5ad4ce31d7fd227cd4) | 6:33 | 4:16 | +| PR #542 | 5:40 | 3:39 | +| **improvement** | | **−14.5%** | + +### Rust/Python time split and memory analysis + +Some debugging revealed where time is actually spent per integration: + +```text +Evolution: Rust/Python split over 891240 quad evaluations — Rust (anomalous dims): 1.573 s (9.5%), Python (numba callback): 15.015 s (90.5%) +Matching: Rust/Python split over 1847076 quad evaluations — Rust (anomalous dims): 2.654 s (11.4%), Python (numba callback): 20.665 s (88.6%) +``` + +~90% of integration time is spent in the Python/Numba callback. This sets a hard ceiling: even eliminating the Rust side entirely could only improve total runtime by ~10%. + +Regarding the peak RSS (~910 MB): this is not a maturin issue. Switching the compile step to `pip install -e . crates/eko` produces nearly identical RSS. The process sits at ~800 MB for the entire run; the peak is caused by a transient spike at the end when computed results are compared against reference values. The 800 MB baseline itself is dominated by Numba `cfunc` compilation at import time: + +```text +[MEM] eko.__init__: START RSS= 58.1 MB (baseline reset) +[MEM] interpolation: loaded (all @njit functions) RSS= 147.3 MB delta=+89.2 MB +[MEM] beta: loaded (all @njit functions) RSS= 186.4 MB delta=+38.8 MB +[MEM] quad_ker: cb_quad_ker_qcd @cfunc loaded RSS= 539.9 MB delta=+353.0 MB +[MEM] quad_ker: cb_quad_ker_ome @cfunc loaded RSS= 557.1 MB delta=+17.1 MB +[MEM] quad_ker: cb_quad_ker_qed @cfunc loaded (all cfuncs done) RSS= 722.9 MB delta=+165.9 MB +[MEM] eko.__init__: END (all submodules loaded) RSS= 722.9 MB +``` + +Of the 722.9 MB loaded at startup, only ~58 MB (~8%) is non-Numba code; the three `cfunc`s alone account for ~74%. The long-term fix is to port them to Rust (see [#383](https://github.com/NNPDF/eko/issues/383)), at which point the memory footprint will drop substantially. + +### Further optimisation attempts and conclusions + +- **quad_vec**: tried passing multiple integrand points at once; failed because `scipy.LowLevelCallable` cannot be used with `quad_vec`, causing a regression to 4:19. +- **SIMD / FMA**: passing batches of points to `rust_quad_ker` could allow LLVM to emit SIMD instructions, but this requires significant changes for a gain capped at ~5–10% on the Rust share (i.e. ≤1% overall). diff --git a/src/eko/evolution_operator/__init__.py.patch b/src/eko/evolution_operator/__init__.py.patch index 1bbb13439..7c0698604 100644 --- a/src/eko/evolution_operator/__init__.py.patch +++ b/src/eko/evolution_operator/__init__.py.patch @@ -1,5 +1,5 @@ diff --git a/src/eko/evolution_operator/__init__.py b/src/eko/evolution_operator/__init__.py -index f3c234f0..3980359e 100644 +index d25e6e4e..6c93b180 100644 --- a/src/eko/evolution_operator/__init__.py +++ b/src/eko/evolution_operator/__init__.py @@ -3,7 +3,6 @@ r"""Contains the central operator classes. @@ -90,13 +90,13 @@ index f3c234f0..3980359e 100644 + cfg.as1 = self.as_list[1] + cfg.as0 = self.as_list[0] + as_list_len = self.as_list.shape[0] -+ for j, a in enumerate(self.as_list.tolist()): -+ cfg.as_list[j] = a ++ as_list = np.ascontiguousarray(self.as_list, dtype=np.float64) ++ ekors.ffi.memmove(cfg.as_list, as_list, as_list.nbytes) + cfg.as_list_len = as_list_len + a_half_x = self.a_half_list.shape[0] + a_half_y = self.a_half_list.shape[1] -+ for j, a in enumerate(self.a_half_list.flatten().tolist()): -+ cfg.a_half[j] = a ++ a_half = np.ascontiguousarray(self.a_half_list, dtype=np.float64) ++ ekors.ffi.memmove(cfg.a_half, a_half, a_half.nbytes) + cfg.a_half_x = a_half_x + cfg.a_half_y = a_half_y + @@ -109,7 +109,7 @@ index f3c234f0..3980359e 100644 def run_op_integration( self, log_grid, -@@ -312,18 +291,62 @@ class Operator(sv.ScaleVariationModeMixin): +@@ -312,18 +291,65 @@ class Operator(sv.ScaleVariationModeMixin): """ column = [] k, logx = log_grid @@ -141,8 +141,15 @@ index f3c234f0..3980359e 100644 + max_area_len = max_areas_shape[0] * max_areas_shape[1] + areas_ffi = ekors.ffi.new("double[]", max_area_len) + cfg.areas = areas_ffi ++ gamma_len = (self.order[0] + 1) * (self.order[1] + 1) * 16 ++ re_gamma_ffi = ekors.ffi.new("double[]", gamma_len) ++ im_gamma_ffi = ekors.ffi.new("double[]", gamma_len) ++ cfg.re_gamma = re_gamma_ffi ++ cfg.im_gamma = im_gamma_ffi + + self.update_cfg(cfg) ++ ++ func = LowLevelCallable(ekors.lib.rust_quad_ker, ekors.ffi.addressof(cfg)) + # iterate basis functions for j, bf in enumerate(self.int_disp): @@ -155,8 +162,8 @@ index f3c234f0..3980359e 100644 temp_dict = {} + # prepare areas for C + curareas = bf.areas_representation -+ for j, x in enumerate(curareas.flatten().tolist()): -+ cfg.areas[j] = x ++ areas = np.ascontiguousarray(curareas, dtype=np.float64) ++ ekors.ffi.memmove(cfg.areas, areas, areas.nbytes) + cfg.areas_x = curareas.shape[0] + cfg.areas_y = curareas.shape[1] # iterate sectors @@ -164,10 +171,6 @@ index f3c234f0..3980359e 100644 + for label in labels: + cfg.mode0 = label[0] + cfg.mode1 = label[1] -+ # construct the low level object -+ func = LowLevelCallable( -+ ekors.lib.rust_quad_ker, ekors.ffi.addressof(cfg) -+ ) res = integrate.quad( - self.quad_ker( - label=label, logx=logx, areas=bf.areas_representation