docs/html/aln__filter_8cpp_source.html

 #define RANGELESS_FN_ENABLE_PARALLEL 1
 #include <fn.hpp>
 #include <iostream>

 // A real-world-inspired example showcasing some of the fn:: functionality:
 //
 // Problem Statement:
 //
 // Given a stream of mrna-to-chromosome alignments (aln_t), sorted by gene-id,
 // filter it as follows, lazily, i.e. fetching from the stream incrementally as-necessary:
 //
 // Per gene_id:
 //      1) Drop alignments where exists an alignment with seq-id
 //         having same mrna-accession and higher mrna-version,
 //         e.g. drop ("NM_002020",5) if ("NM_002020",6) exists.
 //
 //      2) Realign inputs, in parallel, using supplied function realign:aln_t->alns_t
 //
 //      3) Keep top-scoring alignments per mrna-id.
 //
 //      4) Drop duplicates (same mrna-id, chr-id, chr-start, chr-stop).
 //
 //      5) Then select alignments for gene sharing a single common genomic-cds-start:
 //          1) Drop alignments without a valid cds-start.
 //          2) Prefer positions supported by more alignments.
 //          3) Then prefer "NC_*" chr-id.
 //          4) Then prefer lower chr-id.
 //          5) Then prefer lower (more upstream) cds-start.
 //
 //      6) Sort by decreasing alignment score, increasing mrna-id.
 //
 // The above processing steps shall not make copies of aln_t.
 //
 // The aln_filter(...) below has:
 //                      Number of if-statements: 0
 //                              Number of loops: 0
 //                   Control-flow nesting level: 0
 //                      Direct use of iterators: 0
 // Non-const variables (mutable state) declared: 0
 //              Statements mutating local state: 0
 //           Statements mutating external state: 0
 //          Compile-time for the entire example: ~2.8s.
 //
 namespace example
 {

 // A single mrna-to-chromosome alignment record.
 //
 // The type is deliberately move-only (copying is deleted) so that any
 // filtering step that silently copies an alignment fails to compile.
 struct aln_t
 {
     using accession_t = std::string;
     using version_t   = uint32_t;
     using seq_id_t    = std::pair<accession_t, version_t>;
     using gene_id_t   = int32_t;
     using pos_t       = int64_t; // genomic position on chr.
                                  // signed; 1-based; 0 is invalid-pos;
                                  // x and -x refer to same nucleotide position
                                  // in forward and reverse orientations.

     static constexpr pos_t invalid_pos = 0;

     //-----------------------------------------------------------------------
     // Data members. All of them are value-initialized by default, so a
     // default-constructed aln_t never holds indeterminate values.

     int64_t   aln_id{};             // alignment identifier
     gene_id_t gene_id{};            // gene the alignment belongs to
     seq_id_t  mrna_id{};            // (accession, version) of the mrna

     seq_id_t  chr_id{};             // (accession, version) of the chromosome
     pos_t     chr_start{};          // alignment start on chr
     pos_t     chr_stop{};           // alignment stop on chr
     pos_t     chr_cds_start_pos{};  // genomic cds-start; invalid_pos if none

     int64_t   score{};              // alignment score

     struct alignment_details
     {
         //...
     };

     //---------------------------------------------------------------------------
     // Will make our type move-only to assert that our filtering
     // steps do not silently make copies under the hood.
     // (will fail to compile if it tries to)

     aln_t(const aln_t&) = delete;
     aln_t& operator=(const aln_t&) = delete;

     aln_t(aln_t&&) = default;
     aln_t& operator=(aln_t&&) = default;

     aln_t() = default;

     // Field-wise constructor; parameters are in member-declaration order.
     //
     // Since C++20 a class with user-declared constructors (including the
     // deleted/defaulted ones above) is no longer an aggregate, so
     // `aln_t{ id, gene, {...}, ... }` needs a real constructor to keep
     // compiling. Intentionally not `explicit`, so that existing
     // brace-initialization call sites keep working unchanged.
     aln_t(int64_t   aln_id_,
           gene_id_t gene_id_,
           seq_id_t  mrna_id_,
           seq_id_t  chr_id_,
           pos_t     chr_start_,
           pos_t     chr_stop_,
           pos_t     chr_cds_start_pos_,
           int64_t   score_)
       : aln_id{ aln_id_ }
       , gene_id{ gene_id_ }
       , mrna_id{ std::move(mrna_id_) }
       , chr_id{ std::move(chr_id_) }
       , chr_start{ chr_start_ }
       , chr_stop{ chr_stop_ }
       , chr_cds_start_pos{ chr_cds_start_pos_ }
       , score{ score_ }
     {}
 };

 // A batch of alignments. Elements are move-only (aln_t deletes its copy
 // operations), so the pipeline stages pass these vectors around by move.
 using alns_t = std::vector<aln_t>;


 // Short alias for the library namespace, and the pipe-like operator used
 // to chain the processing stages below.
 namespace fn = rangeless::fn;
 using fn::operators::operator%;   // arg % f % g % h; returns h(g(f(std::forward<Arg>(arg))));


 //---------------------------------------------------------------------------
 // Other similar libraries in c++ or other languages do not typically
 // provide "filter to minimal (or maximal) elements" constructs,
 // (fn::where_min_by fn::where_max_by), or fn::group_all_by, fn::unique_all_by,
 // or lazy transform_in_parallel.
 //
 // For demonstration we'll implement them below ourselves,
 // and will then use my::group_all_by instead of fn::group_all_by, etc.
 namespace my
 {

 // Builds a stage that groups *all* elements sharing a key, not only the
 // adjacent ones: the input is first (stably) sorted by the key so that
 // equal keys become neighbours, then split into runs of equal keys.
 static auto group_all_by = [](auto key_fn)
 {
     return [key = std::move(key_fn)](auto elems)
     {
         auto ordered = std::move(elems) % fn::sort_by(key);
         return std::move(ordered) % fn::group_adjacent_by(key);
     };
 };

 //---------------------------------------------------------------------------

 // Builds a stage that keeps one element per distinct key: the input is
 // (stably) sorted by the key, then only the first element of every run of
 // equal keys is retained.
 static auto unique_all_by = [](auto key_fn)
 {
     return [key = std::move(key_fn)](auto elems)
     {
         auto ordered = std::move(elems) % fn::sort_by(key);
         return std::move(ordered) % fn::unique_adjacent_by(key);
     };
 };

 //---------------------------------------------------------------------------

 // Builds a stage that keeps only the elements whose key is minimal.
 //
 // NB: the implementation in fn:: is more involved to avoid sort/group.
 static auto where_min_by = [](auto key_fn)
 {
     return [key = std::move(key_fn)](auto elems)
     {
         // Sort, then split into runs of equal keys. fn::lazy_sort_by would
         // suffice in principle (only the first group is needed), but it is
         // not stable.
         auto groups = std::move(elems)
                     % fn::sort_by(key)
                     % fn::group_adjacent_by(key);

         // The min-elements are exactly the first group; flatten
         // [[min-elements]] -> [min-elements].
         return std::move(groups)
              % fn::take_first(1)
              % fn::concat();
     };
 };

 //---------------------------------------------------------------------------

 // Builds a stage that keeps only the elements whose key is maximal,
 // expressed as where_min_by over the inverted ordering of the key.
 static auto where_max_by = [](auto key_fn)
 {
     auto inverted_key = fn::by::decreasing( std::move(key_fn));
     return my::where_min_by( std::move(inverted_key));
 };

 //---------------------------------------------------------------------------

 // Builds a stage that applies `fn` to every input element on its own
 // std::async task, buffering the pending futures in a fixed-size sliding
 // window and yielding results lazily from the oldest future, i.e. in
 // input order.
 //
 //   fn             - transform function; a copy is stored in every task.
 //   max_queue_size - size of the window of pending futures. A value of 0
 //                    is treated as 1: std::thread::hardware_concurrency()
 //                    (the default) is allowed to return 0 when the value
 //                    is not computable, and a zero-sized window cannot
 //                    yield anything.
 static auto lazy_transform_in_parallel = [](auto fn,
                                           size_t max_queue_size = std::thread::hardware_concurrency())
 {
     const size_t queue_size = max_queue_size > 0 ? max_queue_size : 1;

     return [queue_size, fn](auto inputs) // inputs can be a lazy InputRange
     {
         return std::move(inputs)

         //-------------------------------------------------------------------
         // Lazily yield std::async invocations of fn.

       % fn::transform([fn](auto inp)
         {
             return std::async( std::launch::async,
                 [inp = std::move(inp), fn]() mutable // mutable because inp will be moved-from
                 {
                     return fn(std::move(inp));
                 });
         })

         //-------------------------------------------------------------------
         // Cap the incoming sequence of tasks with a seq of `queue_size`-1
         // dummy future<...>'s, such that all real tasks make it
         // from the other end of the sliding-window in the next stage.

       % fn::append( fn::seq([i = 1UL, queue_size]() mutable
         {
             // `fn` and `inputs` are only named in an unevaluated operand
             // here, so they do not need to be captured.
             using fn_out_t = decltype( fn( std::move( *inputs.begin())));
             return i++ < queue_size ? std::future<fn_out_t>() : fn::end_seq();
         }))

         //-------------------------------------------------------------------
         // Buffer executing async-tasks in a fixed-sized sliding window;
         // yield the result from the oldest (front) std::future.

       % fn::sliding_window(queue_size)

       % fn::transform([](auto view) // a view from a window? Get out!
         {
             return view.begin()->get();
         });
     };
 };


 // for demonstration: suppose a single invocation of transform-function is too small
 // compared to async-invocation overhead, so we want to amortize the overhead by batching:
 // Builds a stage that behaves like my::lazy_transform_in_parallel(fn, ...)
 // but dispatches the inputs to the async tasks in batches of `batch_size`
 // elements, amortizing the per-task overhead; the per-batch outputs are
 // flattened back into a single sequence.
 //
 //   fn             - transform function applied to every element.
 //   max_queue_size - forwarded to my::lazy_transform_in_parallel.
 //   batch_size     - number of inputs handled by a single async task.
 static auto batched_lazy_transform_in_parallel = [](auto fn,
                                                   size_t max_queue_size = std::thread::hardware_concurrency(),
                                                   size_t batch_size = 2)
 {
     return [=](auto inputs)
     {
         return std::move(inputs)
       % fn::in_groups_of(batch_size)

         // NB: `fn` must be captured by copy here. The batch-transform
         // lambda is stored inside the returned lazy sequence and is
         // invoked later from async tasks, after the enclosing closure
         // (typically a temporary in the caller's pipeline expression)
         // may already have been destroyed; a by-reference capture
         // would dangle.
       % my::lazy_transform_in_parallel( [fn](auto inputs_batch)
         {
             return std::move(inputs_batch)
                  % fn::transform(fn)
                  % fn::to_vector(); // NB: since fn::transform is lazy,
                                     // we need to force eager evaluation
                                     // within this batch-transform function.
         }, max_queue_size)
       % fn::concat(); // flatten the batches of outputs
     };
 };


 } //namespace my


 //---------------------------------------------------------------------------

 // Realignment stub: takes ownership of one alignment and hands it back
 // unchanged as the only element of the resulting vector.
 static alns_t realign(aln_t a)
 {
     alns_t realigned;
     realigned.emplace_back(std::move(a));
     return realigned;
 }

 // Shorthand for a generic one-argument lambda: the argument is bound by
 // const reference to the name `_`, enclosing locals are captured by
 // reference, and the lambda returns the value of `expr`.
 // The return type is deduced (by value), which is presumably why the key
 // expressions below wrap members in std::cref instead of returning them
 // directly.
 #define LAMBDA(expr) ([&](const auto& _){ return expr; })


 //---------------------------------------------------------------------------
 // Filtering steps (5) and (6)

 // Applies filtering steps (5) and (6) to the alignments of one gene.
 //
 // Takes the alignments by value (they are moved through the pipeline,
 // never copied) and returns the alignments that share the single selected
 // genomic cds-start, sorted by decreasing score, then increasing mrna-id.
 // The result is empty if no alignment has a valid cds-start.
 static auto filter_to_unique_cds_for_gene(alns_t alns_for_gene) -> alns_t
 {
     return std::move(alns_for_gene)

     // (5.1) Keep alignments with valid cds-start.

   % fn::where LAMBDA( _.chr_cds_start_pos != aln_t::invalid_pos )

     //-------------------------------------------------------------------
     // (5.2) Keep alignments with most-ubiquitous valid cds-starts.
     // Group by cds-start, keep the largest group(s), un-group.

   % my::group_all_by LAMBDA( _.chr_cds_start_pos )
   % my::where_max_by LAMBDA( _.size() )
   % fn::concat()

     //-------------------------------------------------------------------
     // Filter to unique chr_cds_start_pos.
     // (5.3) Prefer on "NC_*" chr-accession,
     // (5.4) then lower chr-id,
     // (5.5) then more upstream cds-start.
     //
     // The first tuple element is `false` for accessions starting with
     // "NC_", and false orders before true, so those win the minimum.

   % my::where_min_by LAMBDA(
           std::make_tuple( _.chr_id.first.find("NC_") != 0,
                 std::cref( _.chr_id ),
                            _.chr_cds_start_pos))

 #if 1
     //-------------------------------------------------------------------
     // (6) Sort by decreasing alignment score, then by increasing mrna-id.

   % fn::sort_by LAMBDA(
           std::make_pair( fn::by::decreasing( _.score),
                                    std::cref( _.mrna_id) ))

 #else // alternatively, e.g if you want to use your own sort

   % [](alns_t alns)
     {
         gfx::timsort(
             alns.begin(), alns.end(),
             fn::by::make_comp([](const aln_t& a)
             {
                 return std::make_pair(
                    fn::by::decreasing( a.score),
                             std::cref( a.mrna_id));
             }));
         return alns; // by-value parameter: implicitly moved on return
     }
 #endif
     ; // end of return-statement
 }


 //---------------------------------------------------------------------------

 // The complete filter described in the problem statement at the top of
 // the file, expressed as one chained expression.
 //
 // Input:  a sequence of aln_t sorted by gene-id; may be a lazy
 //         input-sequence (i.e. an InputRange). It is taken by value and
 //         moved through the stages.
 // Output: the sequence of alignments that survive steps (1)-(6), grouped
 //         by gene in input order.
 //
 // Implement as lambda so can rely on the automatic return-type deduction,
 // which will be some longwindedly-named lazy seq<...>

 static auto aln_filter = [](auto alns_seq) // alns may be a lazy input-sequence (i.e. an InputRange)
 {
     return std::move(alns_seq)

     //-----------------------------------------------------------------------
     // (1) Filter to latest mRNA-version per mRNA-accession
     //
     // Per gene: group by mrna-accession and keep, in every group, the
     // alignments with the greatest (accession, version) mrna-id.

   % fn::group_adjacent_by( std::mem_fn( &aln_t::gene_id))
   % fn::transform( [](alns_t alns_for_gene) -> alns_t
     {
         return std::move(alns_for_gene)
       % my::group_all_by LAMBDA( std::cref( _.mrna_id.first))
       % fn::transform(
               my::where_max_by( std::mem_fn( &aln_t::mrna_id)))
       % fn::concat(); // un-group
     })
   % fn::concat()

     //-----------------------------------------------------------------------
     // (2) Realign in parallel
     //
     // Every input alignment yields a vector of realigned alignments,
     // which is flattened by the following concat.

   % my::batched_lazy_transform_in_parallel(realign) // aln_t -> alns_t
   % fn::concat()

     //-----------------------------------------------------------------------
     // Per mrna-id:
     // (3) Keep top-scoring
     // (4) Drop duplicates

   % fn::group_adjacent_by( std::mem_fn( &aln_t::mrna_id)) // were made adjacent in (1)
   % fn::transform( [](alns_t alns_for_mrna) -> alns_t
     {
         return std::move(alns_for_mrna)
       % my::where_max_by( std::mem_fn( &aln_t::score))
       % my::unique_all_by LAMBDA(
               std::tie( _.mrna_id, _.chr_id, _.chr_start, _.chr_stop ));
     })
   % fn::concat()

     //-----------------------------------------------------------------------
     // (5), (6)
     //
     // Re-group by gene and select/sort the alignments of each gene.

   % fn::group_adjacent_by( std::mem_fn( &aln_t::gene_id))
   % fn::transform( example::filter_to_unique_cds_for_gene)
   % fn::concat();
 };

     // Curiously, group-by/transform/concat pattern appears to be
     // very common. Perhaps it needs a separate abstraction?

 }   // namespace example

 //---------------------------------------------------------------------------

 int main()
 {
     using namespace example;

     alns_t alns{}; // normally these would come from a stream, but for the sake of example will yield from a vec.

     // GeneID:2
     alns.push_back(aln_t{ 101, 2, {"NM_000001", 2}, {"NC_000001", 1}, 1000000, 1001000, 100100, 100}); // keep.
     alns.push_back(aln_t{ 102, 2, {"NM_000001", 2}, {"NC_000001", 1}, 1000000, 1001000, 100100, 100}); // duplicate.
     alns.push_back(aln_t{ 103, 2, {"NM_000001", 2}, {"NC_000001", 1}, 1000001, 1001000, 100100, 50 }); // not top-scoring for this mrna.
     alns.push_back(aln_t{ 104, 2, {"NM_000001", 1}, {"NC_000001", 1}, 1000000, 1001000, 100100, 100}); // superceded mrna-version.
     alns.push_back(aln_t{ 201, 2, {"NM_000002", 1}, {"NC_000001", 1}, 1000000, 1001000, 0,      100}); // no valid-CDS.
     alns.push_back(aln_t{ 301, 2, {"NM_000003", 1}, {"NC_000001", 1}, 1000000, 1001000, 0,      100}); // no valid-CDS.
     alns.push_back(aln_t{ 401, 2, {"NM_000004", 1}, {"NC_000001", 1}, 1000000, 1001000, 0,      100}); // no valid-CDS.
     alns.push_back(aln_t{ 501, 2, {"NM_000005", 1}, {"NC_000001", 1}, 1000000, 1001000, 100100, 110}); // keep.
     alns.push_back(aln_t{ 801, 2, {"NM_000008", 1}, {"NC_000001", 1}, 1000000, 1001000, 100200, 100}); // not most-supported-CDS.

     // GeneID:3
     alns.push_back(aln_t{ 601, 3, {"NM_000005", 1}, {"NC_000001", 1}, 1000000, 1001000, 100100, 100});  // keep.
     alns.push_back(aln_t{ 701, 3, {"NM_000007", 1}, {"NT_000001", 1}, 1000000, 1001000, 100100, 100});  // not on NC.

     namespace fn = rangeless::fn;
     using fn::operators::operator%;

     std::vector<int64_t> kept_ids{};

     // we could just std::move(alns) instead of fn::seq(...) here,
     // but demonstrating that input can also be a lazy seq, e.g. deserializing from an istream.
     fn::seq([&, i = 0UL]() mutable -> aln_t
     {
         return i < alns.size() ? std::move(alns[i++]) : fn::end_seq();
     })

   % example::aln_filter

   % fn::for_each( [&](aln_t a)
     {
         std::cerr << a.gene_id << "\t" << a.aln_id << "\n";
         kept_ids.push_back(a.aln_id);
     });

     assert((kept_ids == std::vector<int64_t>{{ 501, 101, 601 }} ));

     return 0;
 }

rangeless::fn::group_adjacent_by
impl::group_adjacent_by< F > group_adjacent_by(F key_fn)
Group adjacent elements.
Definition: fn.hpp:4059

example::my::unique_all_by
static auto unique_all_by
Definition: aln_filter.cpp:123

example::aln_t::gene_id
gene_id_t gene_id
Definition: aln_filter.cpp:63

rangeless::fn::where
impl::where< P > where(P pred)
Filter elements.
Definition: fn.hpp:3903

example::filter_to_unique_cds_for_gene
static auto filter_to_unique_cds_for_gene(alns_t alns_for_gene) -> alns_t
Definition: aln_filter.cpp:246

example::aln_t::chr_id
seq_id_t chr_id
Definition: aln_filter.cpp:66

rangeless::fn::in_groups_of
impl::group_adjacent_by< impl::chunker > in_groups_of(size_t n)
Group adjacent elements into chunks of specified size.
Definition: fn.hpp:4135

example::aln_t::alignment_details
Definition: aln_filter.cpp:73

example::my::where_max_by
static auto where_max_by
Definition: aln_filter.cpp:152

example::my::lazy_transform_in_parallel
static auto lazy_transform_in_parallel
Definition: aln_filter.cpp:159

example::aln_t::aln_id
int64_t aln_id
Definition: aln_filter.cpp:62

example::my::batched_lazy_transform_in_parallel
static auto batched_lazy_transform_in_parallel
Definition: aln_filter.cpp:207

example::aln_t::score
int64_t score
Definition: aln_filter.cpp:71

example::aln_t::chr_start
pos_t chr_start
Definition: aln_filter.cpp:67

example::aln_t::chr_stop
pos_t chr_stop
Definition: aln_filter.cpp:68

example::aln_filter
static auto aln_filter
Definition: aln_filter.cpp:304

example
Definition: aln_filter.cpp:44

rangeless::fn::concat
impl::concat concat()
Flatten the result of group_all_by or group_adjacent_by.
Definition: fn.hpp:4291

rangeless::fn::sliding_window
impl::sliding_window sliding_window(size_t win_size)
Definition: fn.hpp:3801

example::aln_t::accession_t
std::string accession_t
Definition: aln_filter.cpp:49

rangeless::fn::sort_by
impl::sort_by< F, impl::stable_sort_tag > sort_by(F key_fn)
stable-sort and return the input.
Definition: fn.hpp:4174

example::my::group_all_by
static auto group_all_by
Definition: aln_filter.cpp:111

rangeless::fn::take_first
impl::take_while< impl::call_count_lt > take_first(size_t n=1)
Yield first n elements.
Definition: fn.hpp:3838

example::aln_t::aln_t
aln_t()=default

rangeless::fn
LINQ -like library of higher-order functions for data manipulation.
Definition: fn.hpp:58

rangeless::fn::by::decreasing
impl::gt< T > decreasing(T x)
Wraps the passed value and exposes inverted operator<.
Definition: fn.hpp:898

example::aln_t::seq_id_t
std::pair< accession_t, version_t > seq_id_t
Definition: aln_filter.cpp:51

rangeless::fn::end_seq
Return fn::end_seq() from input-range generator function to signal end-of-inputs.
Definition: fn.hpp:281

rangeless::fn::seq
impl::seq< impl::catch_end< NullaryInvokable > > seq(NullaryInvokable gen_fn)
Adapt a generator function as InputRange.
Definition: fn.hpp:677

main
int main()
Definition: aln_filter.cpp:358

example::aln_t::invalid_pos
static constexpr pos_t invalid_pos
Definition: aln_filter.cpp:58

LAMBDA
#define LAMBDA(expr)
Definition: aln_filter.cpp:240

example::aln_t::mrna_id
seq_id_t mrna_id
Definition: aln_filter.cpp:64

rangeless::fn::to_vector
impl::to_vector to_vector()
Move elements of an Iterable to std::vector.
Definition: fn.hpp:3459

fn.hpp

rangeless::fn::append
impl::append< Iterable > append(Iterable next)
Yield elements of next after elements of arg.
Definition: fn.hpp:4307

example::aln_t::chr_cds_start_pos
pos_t chr_cds_start_pos
Definition: aln_filter.cpp:69

rangeless::fn::transform
impl::transform< F > transform(F map_fn)
Create a seq yielding results of applying the transform functions to input-elements.
Definition: fn.hpp:3670

rangeless::fn::by::make_comp
impl::comp< F > make_comp(F key_fn)
Make binary comparison predicate from a key-function.
Definition: fn.hpp:927

example::alns_t
std::vector< aln_t > alns_t
Definition: aln_filter.cpp:93

example::aln_t::gene_id_t
int32_t gene_id_t
Definition: aln_filter.cpp:52

example::aln_t::pos_t
int64_t pos_t
Definition: aln_filter.cpp:53

rangeless::fn::unique_adjacent_by
impl::unique_adjacent_by< F > unique_adjacent_by(F key_fn)
Keep first element from every adjacently-equal run of elements.
Definition: fn.hpp:4251

example::aln_t::operator=
aln_t & operator=(const aln_t &)=delete

rangeless::fn::for_each
impl::for_each< F > for_each(F fn)
Definition: fn.hpp:3811

example::aln_t::version_t
uint32_t version_t
Definition: aln_filter.cpp:50

example::my::where_min_by
static auto where_min_by
Definition: aln_filter.cpp:135

example::aln_t
Definition: aln_filter.cpp:47

example::realign
static alns_t realign(aln_t a)
Definition: aln_filter.cpp:233