SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/bit>
16 #include <seqan3/std/concepts>
17 #include <iterator>
18 #include <seqan3/std/ranges>
19 #include <string>
20 #include <vector>
21 
48 
49 namespace seqan3
50 {
51 
63 {
64 public:
68  // string_buffer is of type std::string and has some problems with pre-C++11 ABI
69  format_bam() = default;
70  format_bam(format_bam const &) = default;
71  format_bam & operator=(format_bam const &) = default;
72  format_bam(format_bam &&) = default;
73  format_bam & operator=(format_bam &&) = default;
74  ~format_bam() = default;
75 
77 
80  {
81  { "bam" }
82  };
83 
84 protected:
85  template <typename stream_type, // constraints checked by file
86  typename seq_legal_alph_type,
87  typename ref_seqs_type,
88  typename ref_ids_type,
89  typename seq_type,
90  typename id_type,
91  typename offset_type,
92  typename ref_seq_type,
93  typename ref_id_type,
94  typename ref_offset_type,
95  typename align_type,
96  typename cigar_type,
97  typename flag_type,
98  typename mapq_type,
99  typename qual_type,
100  typename mate_type,
101  typename tag_dict_type,
102  typename e_value_type,
103  typename bit_score_type>
104  void read_alignment_record(stream_type & stream,
105  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
106  ref_seqs_type & ref_seqs,
108  seq_type & seq,
109  qual_type & qual,
110  id_type & id,
111  offset_type & offset,
112  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
113  ref_id_type & ref_id,
114  ref_offset_type & ref_offset,
115  align_type & align,
116  cigar_type & cigar_vector,
117  flag_type & flag,
118  mapq_type & mapq,
119  mate_type & mate,
120  tag_dict_type & tag_dict,
121  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
122  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
123 
124  template <typename stream_type,
125  typename header_type,
126  typename seq_type,
127  typename id_type,
128  typename ref_seq_type,
129  typename ref_id_type,
130  typename align_type,
131  typename cigar_type,
132  typename qual_type,
133  typename mate_type,
134  typename tag_dict_type>
135  void write_alignment_record([[maybe_unused]] stream_type & stream,
136  [[maybe_unused]] sam_file_output_options const & options,
137  [[maybe_unused]] header_type && header,
138  [[maybe_unused]] seq_type && seq,
139  [[maybe_unused]] qual_type && qual,
140  [[maybe_unused]] id_type && id,
141  [[maybe_unused]] int32_t const offset,
142  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
143  [[maybe_unused]] ref_id_type && ref_id,
144  [[maybe_unused]] std::optional<int32_t> ref_offset,
145  [[maybe_unused]] align_type && align,
146  [[maybe_unused]] cigar_type && cigar_vector,
147  [[maybe_unused]] sam_flag const flag,
148  [[maybe_unused]] uint8_t const mapq,
149  [[maybe_unused]] mate_type && mate,
150  [[maybe_unused]] tag_dict_type && tag_dict,
151  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
152  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
153 
154 private:
156  bool header_was_read{false};
157 
160 
163  { // naming corresponds to official SAM/BAM specifications
164  int32_t block_size;
165  int32_t refID;
166  int32_t pos;
167  uint32_t l_read_name:8;
168  uint32_t mapq:8;
169  uint32_t bin:16;
170  uint32_t n_cigar_op:16;
172  int32_t l_seq;
173  int32_t next_refID;
174  int32_t next_pos;
175  int32_t tlen;
176  };
177 
180  {
181  [] () constexpr
182  {
184 
185  using index_t = std::make_unsigned_t<char>;
186 
187  // ret['M'] = 0; set anyway by initialization
188  ret[static_cast<index_t>('I')] = 1;
189  ret[static_cast<index_t>('D')] = 2;
190  ret[static_cast<index_t>('N')] = 3;
191  ret[static_cast<index_t>('S')] = 4;
192  ret[static_cast<index_t>('H')] = 5;
193  ret[static_cast<index_t>('P')] = 6;
194  ret[static_cast<index_t>('=')] = 7;
195  ret[static_cast<index_t>('X')] = 8;
196 
197  return ret;
198  }()
199  };
200 
202  static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
203  {
204  --end;
205  if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
206  if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
207  if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
208  if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
209  if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
210  return 0;
211  }
212 
213  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
214 
221  template <typename stream_view_type, std::integral number_type>
222  void read_field(stream_view_type && stream_view, number_type & target)
223  {
224  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
225  }
226 
232  template <typename stream_view_type>
233  void read_field(stream_view_type && stream_view, float & target)
234  {
235  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
236  }
237 
248  template <typename stream_view_type, typename optional_value_type>
249  void read_field(stream_view_type && stream_view, std::optional<optional_value_type> & target)
250  {
251  optional_value_type tmp;
252  read_field(std::forward<stream_view_type>(stream_view), tmp);
253  target = tmp;
254  }
255 
256  template <typename stream_view_type, typename value_type>
258  stream_view_type && stream_view,
259  value_type const & SEQAN3_DOXYGEN_ONLY(value));
260 
261  template <typename stream_view_type>
262  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
263 
264  template <typename cigar_input_type>
265  auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
266 
267  static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
268 };
269 
271 template <typename stream_type, // constraints checked by file
272  typename seq_legal_alph_type,
273  typename ref_seqs_type,
274  typename ref_ids_type,
275  typename seq_type,
276  typename id_type,
277  typename offset_type,
278  typename ref_seq_type,
279  typename ref_id_type,
280  typename ref_offset_type,
281  typename align_type,
282  typename cigar_type,
283  typename flag_type,
284  typename mapq_type,
285  typename qual_type,
286  typename mate_type,
287  typename tag_dict_type,
288  typename e_value_type,
289  typename bit_score_type>
290 inline void format_bam::read_alignment_record(stream_type & stream,
291  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
292  ref_seqs_type & ref_seqs,
294  seq_type & seq,
295  qual_type & qual,
296  id_type & id,
297  offset_type & offset,
298  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
299  ref_id_type & ref_id,
300  ref_offset_type & ref_offset,
301  align_type & align,
302  cigar_type & cigar_vector,
303  flag_type & flag,
304  mapq_type & mapq,
305  mate_type & mate,
306  tag_dict_type & tag_dict,
307  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
308  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
309 {
310  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
311  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
312  "The ref_offset must be a specialisation of std::optional.");
313 
314  static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
315  "The type of field::mapq must be uint8_t.");
316 
317  static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
318  "The type of field::flag must be seqan3::sam_flag.");
319 
320  auto stream_view = seqan3::views::istreambuf(stream);
321 
322  // these variables need to be stored to compute the ALIGNMENT
323  [[maybe_unused]] int32_t offset_tmp{};
324  [[maybe_unused]] int32_t soft_clipping_end{};
325  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
326  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
327 
328  // Header
329  // -------------------------------------------------------------------------------------------------------------
330  if (!header_was_read)
331  {
332  // magic BAM string
333  if (!std::ranges::equal(stream_view | views::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
334  throw format_error{"File is not in BAM format."};
335 
336  int32_t l_text{}; // length of header text including \0 character
337  int32_t n_ref{}; // number of reference sequences
338  int32_t l_name{}; // 1 + length of reference name including \0 character
339  int32_t l_ref{}; // length of reference sequence
340 
341  read_field(stream_view, l_text);
342 
343  if (l_text > 0) // header text is present
344  read_header(stream_view | views::take_exactly_or_throw(l_text), header, ref_seqs);
345 
346  read_field(stream_view, n_ref);
347 
348  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
349  {
350  read_field(stream_view, l_name);
351 
352  string_buffer.resize(l_name - 1);
353  std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.data()); // copy without \0 character
354  std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
355 
356  read_field(stream_view, l_ref);
357 
358  if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
359  {
360  // If there was no header text, we parse reference sequences block as header information
361  if (l_text == 0)
362  {
363  auto & reference_ids = header.ref_ids();
364  // put the length of the reference sequence into ref_id_info
365  header.ref_id_info.emplace_back(l_ref, "");
366  // put the reference name into reference_ids
367  reference_ids.push_back(string_buffer);
368  // assign the reference name an ascending reference id (starts at index 0).
369  header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
370  continue;
371  }
372  }
373 
374  auto id_it = header.ref_dict.find(string_buffer);
375 
376  // sanity checks of reference information to existing header object:
377  if (id_it == header.ref_dict.end()) // [unlikely]
378  {
379  throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
380  "' found in BAM file header (header.ref_ids():",
381  header.ref_ids(), ").")};
382  }
383  else if (id_it->second != ref_idx) // [unlikely]
384  {
385  throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
386  " does not correspond to the position ", id_it->second,
387  " in the header (header.ref_ids():", header.ref_ids(), ").")};
388  }
389  else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
390  {
391  throw format_error{"Provided reference has unequal length as specified in the header."};
392  }
393  }
394 
395  header_was_read = true;
396 
397  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
398  return;
399  }
400 
401  // read alignment record into buffer
402  // -------------------------------------------------------------------------------------------------------------
404  std::ranges::copy(stream_view | views::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
405 
406  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
407  {
408  throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
409  "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
410  }
411  else if (core.refID > -1) // not unmapped
412  {
413  ref_id = core.refID; // field::ref_id
414  }
415 
416  flag = core.flag; // field::flag
417  mapq = core.mapq; // field::mapq
418 
419  if (core.pos > -1) // [[likely]]
420  ref_offset = core.pos; // field::ref_offset
421 
422  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
423  {
424  if (core.next_refID > -1)
425  get<0>(mate) = core.next_refID;
426 
427  if (core.next_pos > -1) // [[likely]]
428  get<1>(mate) = core.next_pos;
429 
430  get<2>(mate) = core.tlen;
431  }
432 
433  // read id
434  // -------------------------------------------------------------------------------------------------------------
435  read_field(stream_view | views::take_exactly_or_throw(core.l_read_name - 1), id); // field::id
436  std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
437 
438  // read cigar string
439  // -------------------------------------------------------------------------------------------------------------
440  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
441  {
442  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
443  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
444  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
445  }
446  else
447  {
449  }
450 
451  offset = offset_tmp;
452 
453  // read sequence
454  // -------------------------------------------------------------------------------------------------------------
455  if (core.l_seq > 0) // sequence information is given
456  {
457  auto seq_stream = stream_view
458  | views::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
460  {
461  return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
462  dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
463  });
464 
465  if constexpr (detail::decays_to_ignore_v<seq_type>)
466  {
467  auto skip_sequence_bytes = [&] ()
468  {
469  detail::consume(seq_stream);
470  if (core.l_seq & 1)
471  std::ranges::next(std::ranges::begin(stream_view));
472  };
473 
474  if constexpr (!detail::decays_to_ignore_v<align_type>)
475  {
476  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
477  "If you want to read ALIGNMENT but not SEQ, the alignment"
478  " object must store a sequence container at the second (query) position.");
479 
480  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
481  {
482  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
483  using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
484  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
485 
486  get<1>(align).reserve(seq_length);
487 
488  auto tmp_iter = std::ranges::begin(seq_stream);
489  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
490 
491  if (offset_tmp & 1)
492  {
493  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
494  ++tmp_iter;
495  --seq_length;
496  }
497 
498  for (size_t i = (seq_length / 2); i > 0; --i)
499  {
500  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
501  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
502  ++tmp_iter;
503  }
504 
505  if (seq_length & 1)
506  {
507  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
508  ++tmp_iter;
509  --soft_clipping_end;
510  }
511 
512  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
513  }
514  else
515  {
516  skip_sequence_bytes();
517  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
518  }
519  }
520  else
521  {
522  skip_sequence_bytes();
523  }
524  }
525  else
526  {
527  using alph_t = std::ranges::range_value_t<decltype(seq)>;
528  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
529 
530  for (auto [d1, d2] : seq_stream)
531  {
532  seq.push_back(from_dna16[to_rank(d1)]);
533  seq.push_back(from_dna16[to_rank(d2)]);
534  }
535 
536  if (core.l_seq & 1)
537  {
538  dna16sam d = dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
539  seq.push_back(from_dna16[to_rank(d)]);
540  std::ranges::next(std::ranges::begin(stream_view));
541  }
542 
543  if constexpr (!detail::decays_to_ignore_v<align_type>)
544  {
545  assign_unaligned(get<1>(align),
546  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
547  std::ranges::distance(seq) - soft_clipping_end));
548  }
549  }
550  }
551 
552  // read qual string
553  // -------------------------------------------------------------------------------------------------------------
554  read_field(stream_view | views::take_exactly_or_throw(core.l_seq)
555  | std::views::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
556 
557  // All remaining optional fields if any: SAM tags dictionary
558  // -------------------------------------------------------------------------------------------------------------
559  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
560  core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
561  assert(remaining_bytes >= 0);
562  auto tags_view = stream_view | views::take_exactly_or_throw(remaining_bytes);
563 
564  while (tags_view.size() > 0)
565  read_field(tags_view, tag_dict);
566 
567  // DONE READING - wrap up
568  // -------------------------------------------------------------------------------------------------------------
569  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
570  {
571  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
572  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
573  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
574  if (core.l_seq != 0 && offset_tmp == core.l_seq)
575  {
576  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
577  { // maybe only throw in debug mode and otherwise return an empty alignment?
578  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
579  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
580  "stored in the optional field CG. You need to read in the field::tags and "
581  "field::seq in order to access this information.")};
582  }
583  else
584  {
585  auto it = tag_dict.find("CG"_tag);
586 
587  if (it == tag_dict.end())
588  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
589  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
590  "stored in the optional field CG but this tag is not present in the given ",
591  "record.")};
592 
593  auto cigar_view = std::views::all(std::get<std::string>(it->second));
594  std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
595  offset_tmp = soft_clipping_end = 0;
596  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
597  tag_dict.erase(it); // remove redundant information
598 
599  if constexpr (!detail::decays_to_ignore_v<align_type>)
600  {
601  assign_unaligned(get<1>(align),
602  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
603  std::ranges::distance(seq) - soft_clipping_end));
604  }
605  }
606  }
607  }
608 
609  // Alignment object construction
610  if constexpr (!detail::decays_to_ignore_v<align_type>)
611  construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
612 
613  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
614  std::swap(cigar_vector, tmp_cigar_vector);
615 }
616 
618 template <typename stream_type,
619  typename header_type,
620  typename seq_type,
621  typename id_type,
622  typename ref_seq_type,
623  typename ref_id_type,
624  typename align_type,
625  typename cigar_type,
626  typename qual_type,
627  typename mate_type,
628  typename tag_dict_type>
629 inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
630  [[maybe_unused]] sam_file_output_options const & options,
631  [[maybe_unused]] header_type && header,
632  [[maybe_unused]] seq_type && seq,
633  [[maybe_unused]] qual_type && qual,
634  [[maybe_unused]] id_type && id,
635  [[maybe_unused]] int32_t const offset,
636  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
637  [[maybe_unused]] ref_id_type && ref_id,
638  [[maybe_unused]] std::optional<int32_t> ref_offset,
639  [[maybe_unused]] align_type && align,
640  [[maybe_unused]] cigar_type && cigar_vector,
641  [[maybe_unused]] sam_flag const flag,
642  [[maybe_unused]] uint8_t const mapq,
643  [[maybe_unused]] mate_type && mate,
644  [[maybe_unused]] tag_dict_type && tag_dict,
645  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
646  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
647 {
648  // ---------------------------------------------------------------------
649  // Type Requirements (as static asserts for user friendliness)
650  // ---------------------------------------------------------------------
651  static_assert((std::ranges::forward_range<seq_type> &&
652  alphabet<std::ranges::range_reference_t<seq_type>>),
653  "The seq object must be a std::ranges::forward_range over "
654  "letters that model seqan3::alphabet.");
655 
656  static_assert((std::ranges::forward_range<id_type> &&
657  alphabet<std::ranges::range_reference_t<id_type>>),
658  "The id object must be a std::ranges::forward_range over "
659  "letters that model seqan3::alphabet.");
660 
661  static_assert((std::ranges::forward_range<ref_seq_type> &&
662  alphabet<std::ranges::range_reference_t<ref_seq_type>>),
663  "The ref_seq object must be a std::ranges::forward_range "
664  "over letters that model seqan3::alphabet.");
665 
666  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
667  {
668  static_assert((std::ranges::forward_range<ref_id_type> ||
669  std::integral<std::remove_reference_t<ref_id_type>> ||
670  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
671  "The ref_id object must be a std::ranges::forward_range "
672  "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
673  }
674 
676  "The align object must be a std::pair of two ranges whose "
677  "value_type is comparable to seqan3::gap");
678 
679  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
680  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
681  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
682  "The align object must be a std::pair of two ranges whose "
683  "value_type is comparable to seqan3::gap");
684 
685  static_assert((std::ranges::forward_range<qual_type> &&
686  alphabet<std::ranges::range_reference_t<qual_type>>),
687  "The qual object must be a std::ranges::forward_range "
688  "over letters that model seqan3::alphabet.");
689 
691  "The mate object must be a std::tuple of size 3 with "
692  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
693  "2) a std::integral or std::optional<std::integral>, and "
694  "3) a std::integral.");
695 
696  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
697  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
698  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
699  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
700  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
701  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
702  "The mate object must be a std::tuple of size 3 with "
703  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
704  "2) a std::integral or std::optional<std::integral>, and "
705  "3) a std::integral.");
706 
707  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
708  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
709 
710  if constexpr (detail::decays_to_ignore_v<header_type>)
711  {
712  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
713  "You can either construct the output file with ref_ids and ref_seqs information and "
714  "the header will be created for you, or you can access the `header` member directly."};
715  }
716  else
717  {
718  // ---------------------------------------------------------------------
719  // logical Requirements
720  // ---------------------------------------------------------------------
721 
722  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
723  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
724 
725  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
726 
727  // ---------------------------------------------------------------------
728  // Writing the BAM Header on first call
729  // ---------------------------------------------------------------------
730  if (!header_was_written)
731  {
732  stream << "BAM\1";
734  write_header(os, options, header); // write SAM header to temporary stream to query the size.
735  int32_t l_text{static_cast<int32_t>(os.str().size())};
736  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
737 
738  stream << os.str();
739 
740  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
741  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
742 
743  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
744  {
745  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
746  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
747  // write reference name:
748  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
749  stream_it = '\0';
750  // write reference sequence length:
751  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
752  }
753 
754  header_was_written = true;
755  }
756 
757  // ---------------------------------------------------------------------
758  // Writing the Record
759  // ---------------------------------------------------------------------
760  int32_t ref_length{};
761 
762  // if alignment is non-empty, replace cigar_vector.
763  // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
764  if (!std::ranges::empty(cigar_vector))
765  {
766  int32_t dummy_seq_length{};
767  for (auto & [count, operation] : cigar_vector)
768  detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
769  }
770  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
771  {
772  ref_length = std::ranges::distance(get<1>(align));
773 
774  // compute possible distance from alignment end to sequence end
775  // which indicates soft clipping at the end.
776  // This should be replaced by a free count_gaps function for
777  // aligned sequences which is more efficient if possible.
778  int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
779 
780  for (auto chr : get<1>(align))
781  if (chr == gap{})
782  ++off_end;
783 
784  off_end -= ref_length;
785  cigar_vector = detail::get_cigar_vector(align, offset, off_end);
786  }
787 
788  if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
789  {
790  tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
791  cigar_vector.resize(2);
792  cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
793  cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
794  }
795 
796  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
797 
798  // Compute the value for the l_read_name field for the bam record.
799  // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
800  // the data type to store the value is uint8_t and 255 is the maximal size.
801  // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
802  // 2 bytes.
803  uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
804  read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
805 
807  {
808  /* block_size */ 0, // will be initialised right after
809  /* refID */ -1, // will be initialised right after
810  /* pos */ ref_offset.value_or(-1),
811  /* l_read_name */ read_name_size,
812  /* mapq */ mapq,
813  /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
814  /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
815  /* flag */ flag,
816  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
817  /* next_refId */ -1, // will be initialised right after
818  /* next_pos */ get<1>(mate).value_or(-1),
819  /* tlen */ get<2>(mate)
820  };
821 
822  auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
823  [[maybe_unused]] auto & id_target)
824  {
825  using id_t = std::remove_reference_t<decltype(id_source)>;
826 
827  if constexpr (!detail::decays_to_ignore_v<id_t>)
828  {
829  if constexpr (std::integral<id_t>)
830  {
831  id_target = id_source;
832  }
833  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
834  {
835  id_target = id_source.value_or(-1);
836  }
837  else
838  {
839  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
840  {
841  auto id_it = header.ref_dict.end();
842 
843  if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
844  std::ranges::sized_range<decltype(id_source)> &&
845  std::ranges::borrowed_range<decltype(id_source)>)
846  {
847  id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
848  std::ranges::size(id_source)});
849  }
850  else
851  {
852  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
853 
854  static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
855  "The ref_id type is not convertible to the reference id information stored in the "
856  "reference dictionary of the header object.");
857 
858  id_it = header.ref_dict.find(id_source);
859  }
860 
861  if (id_it == header.ref_dict.end())
862  {
863  throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
864  "not be found in BAM header ref_dict: ",
865  header.ref_dict, ".")};
866  }
867 
868  id_target = id_it->second;
869  }
870  }
871  }
872  };
873 
874  // initialise core.refID
875  check_and_assign_id_to(ref_id, core.refID);
876 
877  // initialise core.next_refID
878  check_and_assign_id_to(get<0>(mate), core.next_refID);
879 
880  // initialise core.block_size
881  core.block_size = sizeof(core) - 4/*block_size excluded*/ +
882  core.l_read_name +
883  core.n_cigar_op * 4 + // each int32_t has 4 bytes
884  (core.l_seq + 1) / 2 + // bitcompressed seq
885  core.l_seq + // quality string
886  tag_dict_binary_str.size();
887 
888  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
889 
890  if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
891  stream_it = '*';
892  else
893  std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
894  stream_it = '\0';
895 
896  // write cigar
897  for (auto [cigar_count, op] : cigar_vector)
898  {
899  cigar_count = cigar_count << 4;
900  cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
901  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
902  }
903 
904  // write seq (bit-compressed: dna16sam characters go into one byte)
905  using alph_t = std::ranges::range_value_t<seq_type>;
906  constexpr auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
907 
908  auto sit = std::ranges::begin(seq);
909  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
910  {
911  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
912  ++sidx, ++sit;
913  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
914  stream_it = static_cast<char>(compressed_chr);
915  }
916 
917  if (core.l_seq & 1) // write one more
918  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
919 
920  // write qual
921  if (std::ranges::empty(qual))
922  {
923  auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
924  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
925  }
926  else
927  {
928  if (std::ranges::distance(qual) != core.l_seq)
929  throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
930  core.l_seq, ". Got quality with size ",
931  std::ranges::distance(qual), " instead.")};
932 
933  auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
934  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
935  }
936 
937  // write optional fields
938  stream << tag_dict_binary_str;
939  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
940 }
941 
943 template <typename stream_view_type, typename value_type>
945  stream_view_type && stream_view,
946  value_type const & SEQAN3_DOXYGEN_ONLY(value))
947 {
948  int32_t count;
949  read_field(stream_view, count); // read length of vector
950  std::vector<value_type> tmp_vector;
951  tmp_vector.reserve(count);
952 
953  while (count > 0)
954  {
955  value_type tmp{};
956  read_field(stream_view, tmp);
957  tmp_vector.push_back(std::move(tmp));
958  --count;
959  }
960  variant = std::move(tmp_vector);
961 }
962 
980 template <typename stream_view_type>
981 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
982 {
983  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
984  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
985  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
986  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
987  by the length (int32_t) of the array, followed by the values.
988  */
989  auto it = std::ranges::begin(stream_view);
990  uint16_t tag = static_cast<uint16_t>(*it) << 8;
991  ++it; // skip char read before
992 
993  tag += static_cast<uint16_t>(*it);
994  ++it; // skip char read before
995 
996  char type_id = *it;
997  ++it; // skip char read before
998 
999  switch (type_id)
1000  {
1001  case 'A' : // char
1002  {
1003  target[tag] = *it;
1004  ++it; // skip char that has been read
1005  break;
1006  }
1007  // all integer sizes are possible
1008  case 'c' : // int8_t
1009  {
1010  int8_t tmp;
1011  read_field(stream_view, tmp);
1012  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1013  break;
1014  }
1015  case 'C' : // uint8_t
1016  {
1017  uint8_t tmp;
1018  read_field(stream_view, tmp);
1019  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1020  break;
1021  }
1022  case 's' : // int16_t
1023  {
1024  int16_t tmp;
1025  read_field(stream_view, tmp);
1026  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1027  break;
1028  }
1029  case 'S' : // uint16_t
1030  {
1031  uint16_t tmp;
1032  read_field(stream_view, tmp);
1033  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1034  break;
1035  }
1036  case 'i' : // int32_t
1037  {
1038  int32_t tmp;
1039  read_field(stream_view, tmp);
1040  target[tag] = std::move(tmp); // readable sam format only allows int32_t
1041  break;
1042  }
1043  case 'I' : // uint32_t
1044  {
1045  uint32_t tmp;
1046  read_field(stream_view, tmp);
1047  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1048  break;
1049  }
1050  case 'f' : // float
1051  {
1052  float tmp;
1053  read_field(stream_view, tmp);
1054  target[tag] = tmp;
1055  break;
1056  }
1057  case 'Z' : // string
1058  {
1059  string_buffer.clear();
1060  while (!is_char<'\0'>(*it))
1061  {
1062  string_buffer.push_back(*it);
1063  ++it;
1064  }
1065  ++it; // skip \0
1066  target[tag] = string_buffer;
1067  break;
1068  }
1069  case 'H' : // byte array, represented as null-terminated string; specification requires even number of bytes
1070  {
1071  std::vector<std::byte> byte_array;
1072  std::byte value;
1073  while (!is_char<'\0'>(*it))
1074  {
1075  string_buffer.clear();
1076  string_buffer.push_back(*it);
1077  ++it;
1078 
1079  if (*it == '\0')
1080  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1081 
1082  string_buffer.push_back(*it);
1083  ++it;
1084  read_field(string_buffer, value);
1085  byte_array.push_back(value);
1086  }
1087  ++it; // skip \0
1088  target[tag] = byte_array;
1089  break;
1090  }
1091  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1092  {
1093  char array_value_type_id = *it;
1094  ++it; // skip char read before
1095 
1096  switch (array_value_type_id)
1097  {
1098  case 'c' : // int8_t
1099  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1100  break;
1101  case 'C' : // uint8_t
1102  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1103  break;
1104  case 's' : // int16_t
1105  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1106  break;
1107  case 'S' : // uint16_t
1108  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1109  break;
1110  case 'i' : // int32_t
1111  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1112  break;
1113  case 'I' : // uint32_t
1114  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1115  break;
1116  case 'f' : // float
1117  read_sam_dict_vector(target[tag], stream_view, float{});
1118  break;
1119  default:
1120  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1121  "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1122  }
1123  break;
1124  }
1125  default:
1126  throw format_error{detail::to_string("The second character in the numerical id of a "
1127  "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1128  }
1129 }
1130 
1145 template <typename cigar_input_type>
1146 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1147 {
1148  std::vector<cigar> operations{};
1149  char operation{'\0'};
1150  uint32_t count{};
1151  int32_t ref_length{}, seq_length{};
1152  uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1153  constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1154  constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1155 
1156  if (n_cigar_op == 0) // [[unlikely]]
1157  return std::tuple{operations, ref_length, seq_length};
1158 
1159  // parse the rest of the cigar
1160  // -------------------------------------------------------------------------------------------------------------
1161  while (n_cigar_op > 0) // until stream is not empty
1162  {
1163  std::ranges::copy_n(std::ranges::begin(cigar_input),
1164  sizeof(operation_and_count),
1165  reinterpret_cast<char*>(&operation_and_count));
1166  operation = cigar_mapping[operation_and_count & cigar_mask];
1167  count = operation_and_count >> 4;
1168 
1169  detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1170  operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1171  --n_cigar_op;
1172  }
1173 
1174  return std::tuple{operations, ref_length, seq_length};
1175 }
1176 
1181 {
1182  std::string result{};
1183 
1184  auto stream_variant_fn = [&result] (auto && arg) // helper to print an std::variant
1185  {
1186  // T is either char, int32_t, float, std::string, or a std::vector<some int>
1187  using T = std::remove_cvref_t<decltype(arg)>;
1188 
1189  if constexpr (std::same_as<T, int32_t>)
1190  {
1191  // always choose the smallest possible representation [cCsSiI]
1192  size_t const absolute_arg = std::abs(arg);
1193  auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1194  bool const negative = arg < 0;
1195  n = n * n + 2 * negative; // for switch case order
1196 
1197  switch (n)
1198  {
1199  case 0:
1200  {
1201  result[result.size() - 1] = 'C';
1202  result.append(reinterpret_cast<char const *>(&arg), 1);
1203  break;
1204  }
1205  case 1:
1206  {
1207  result[result.size() - 1] = 'S';
1208  result.append(reinterpret_cast<char const *>(&arg), 2);
1209  break;
1210  }
1211  case 2:
1212  {
1213  result[result.size() - 1] = 'c';
1214  int8_t tmp = static_cast<int8_t>(arg);
1215  result.append(reinterpret_cast<char const *>(&tmp), 1);
1216  break;
1217  }
1218  case 3:
1219  {
1220  result[result.size() - 1] = 's';
1221  int16_t tmp = static_cast<int16_t>(arg);
1222  result.append(reinterpret_cast<char const *>(&tmp), 2);
1223  break;
1224  }
1225  default:
1226  {
1227  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1228  break;
1229  }
1230  }
1231  }
1232  else if constexpr (std::same_as<T, std::string>)
1233  {
1234  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1235  }
1236  else if constexpr (!std::ranges::range<T>) // char, float
1237  {
1238  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1239  }
1240  else // std::vector of some arithmetic_type type
1241  {
1242  int32_t sz{static_cast<int32_t>(arg.size())};
1243  result.append(reinterpret_cast<char *>(&sz), 4);
1244  result.append(reinterpret_cast<char const *>(arg.data()),
1245  arg.size() * sizeof(std::ranges::range_value_t<T>));
1246  }
1247  };
1248 
1249  for (auto & [tag, variant] : tag_dict)
1250  {
1251  result.push_back(static_cast<char>(tag / 256));
1252  result.push_back(static_cast<char>(tag % 256));
1253 
1254  result.push_back(detail::sam_tag_type_char[variant.index()]);
1255 
1256  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1257  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1258 
1259  std::visit(stream_variant_fn, variant);
1260  }
1261 
1262  return result;
1263 }
1264 
1265 } // namespace seqan3
Provides seqan3::detail::convert_through_char_representation.
Provides the C++20 <bit> header if it is not already available.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:211
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:264
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:59
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:39
The alignment base format.
Definition: format_sam_base.hpp:62
void write_header(stream_t &stream, sam_file_output_options const &options, sam_file_header< ref_ids_type > &header)
Writes the SAM header.
Definition: format_sam_base.hpp:637
void transfer_soft_clipping_to(std::vector< cigar > const &cigar_vector, int32_t &sc_begin, int32_t &sc_end) const
Transfer soft clipping information from the cigar_vector to sc_begin and sc_end.
Definition: format_sam_base.hpp:189
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:83
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:419
void construct_alignment(align_type &align, std::vector< cigar > &cigar_vector, [[maybe_unused]] int32_t rid, [[maybe_unused]] ref_seqs_type &ref_seqs, [[maybe_unused]] int32_t ref_start, size_t ref_length)
Construct the field::alignment depending on the given information.
Definition: format_sam_base.hpp:231
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: dna16sam.hpp:48
The actual implementation of seqan3::cigar::operation for documentation purposes only.
Definition: cigar_operation.hpp:48
The BAM format.
Definition: format_bam.hpp:63
format_bam()=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:290
void read_field(stream_view_type &&stream_view, std::optional< optional_value_type > &target)
Delegate parsing of std::optional types to parsing of the inner value type.
Definition: format_bam.hpp:249
void write_alignment_record([[maybe_unused]] stream_type &stream, [[maybe_unused]] sam_file_output_options const &options, [[maybe_unused]] header_type &&header, [[maybe_unused]] seq_type &&seq, [[maybe_unused]] qual_type &&qual, [[maybe_unused]] id_type &&id, [[maybe_unused]] int32_t const offset, [[maybe_unused]] ref_seq_type &&ref_seq, [[maybe_unused]] ref_id_type &&ref_id, [[maybe_unused]] std::optional< int32_t > ref_offset, [[maybe_unused]] align_type &&align, [[maybe_unused]] cigar_type &&cigar_vector, [[maybe_unused]] sam_flag const flag, [[maybe_unused]] uint8_t const mapq, [[maybe_unused]] mate_type &&mate, [[maybe_unused]] tag_dict_type &&tag_dict, [[maybe_unused]] double e_value, [[maybe_unused]] double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:629
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
void read_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:233
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
auto parse_binary_cigar(cigar_input_type &&cigar_input, uint16_t n_cigar_op) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_bam.hpp:1146
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:1180
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition: format_bam.hpp:159
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition: format_bam.hpp:202
void read_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:222
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_bam.hpp:944
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition: format_bam.hpp:156
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition: format_bam.hpp:180
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:80
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:33
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:158
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:155
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:119
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:326
T clear(T... args)
The Concepts library.
Provides various transformation traits used by the range module.
T data(T... args)
Auxiliary for pretty printing of exception messages.
Provides seqan3::debug_stream and related types.
Provides type traits for working with templates.
Provides seqan3::dna16sam.
T emplace_back(T... args)
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:73
std::vector< cigar > get_cigar_vector(alignment_type &&alignment, uint32_t const query_start_pos=0, uint32_t const query_end_pos=0, bool const extended_cigar=false)
Creates a cigar string (SAM format) given a seqan3::detail::pairwise_alignment represented by two seq...
Definition: cigar.hpp:201
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: cigar.hpp:263
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: misc.hpp:28
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:434
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:168
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:150
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:145
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:114
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:91
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:94
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:70
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Provides various utility functions.
Auxiliary functions for the alignment IO.
Provides seqan3::views::istreambuf.
T min(T... args)
std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: cigar.hpp:134
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:38
void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: cigar.hpp:105
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:36
std::string to_string(value_type &&...values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
Provides various utility functions.
Adaptations of concepts from the Ranges TS.
T reserve(T... args)
T resize(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
Provides the seqan3::sam_file_header class.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_input_options.
Provides seqan3::sam_file_output_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides helper data structures for the seqan3::sam_file_output.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition: format_bam.hpp:163
uint32_t bin
The bin number.
Definition: format_bam.hpp:169
sam_flag flag
The flag value (uint16_t enum).
Definition: format_bam.hpp:171
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition: format_bam.hpp:170
int32_t next_refID
The reference id of the mate.
Definition: format_bam.hpp:173
int32_t l_seq
The number of bases of the read sequence.
Definition: format_bam.hpp:172
int32_t refID
The reference id the read was mapped to.
Definition: format_bam.hpp:165
int32_t pos
The begin position of the alignment.
Definition: format_bam.hpp:166
int32_t block_size
The size in bytes of the whole BAM record.
Definition: format_bam.hpp:164
uint32_t l_read_name
The length of the read name including the \0 character.
Definition: format_bam.hpp:167
int32_t tlen
The template length of the read and its mate.
Definition: format_bam.hpp:175
uint32_t mapq
The mapping quality.
Definition: format_bam.hpp:168
int32_t next_pos
The begin position of the mate alignment.
Definition: format_bam.hpp:174
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:88
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
Exposes the value_type of another type.
Definition: pre.hpp:58
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
T tuple_size_v
Provides character predicates for tokenisation.
Provides seqan3::tuple_like.
T visit(T... args)