SeqAn3  3.1.0
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/bit>
16 #include <iterator>
17 #include <seqan3/std/ranges>
18 #include <string>
19 #include <vector>
20 
33 
34 namespace seqan3
35 {
36 
49 class format_bam : private detail::format_sam_base
50 {
51 public:
55  // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56  format_bam() = default;
57  format_bam(format_bam const &) = default;
58  format_bam & operator=(format_bam const &) = default;
59  format_bam(format_bam &&) = default;
60  format_bam & operator=(format_bam &&) = default;
61  ~format_bam() = default;
62 
64 
67  {
68  { "bam" }
69  };
70 
71 protected:
72  template <typename stream_type, // constraints checked by file
73  typename seq_legal_alph_type,
74  typename ref_seqs_type,
75  typename ref_ids_type,
76  typename seq_type,
77  typename id_type,
78  typename offset_type,
79  typename ref_seq_type,
80  typename ref_id_type,
81  typename ref_offset_type,
82  typename align_type,
83  typename cigar_type,
84  typename flag_type,
85  typename mapq_type,
86  typename qual_type,
87  typename mate_type,
88  typename tag_dict_type,
89  typename e_value_type,
90  typename bit_score_type>
91  void read_alignment_record(stream_type & stream,
92  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
93  ref_seqs_type & ref_seqs,
95  seq_type & seq,
96  qual_type & qual,
97  id_type & id,
98  offset_type & offset,
99  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
100  ref_id_type & ref_id,
101  ref_offset_type & ref_offset,
102  align_type & align,
103  cigar_type & cigar_vector,
104  flag_type & flag,
105  mapq_type & mapq,
106  mate_type & mate,
107  tag_dict_type & tag_dict,
108  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
109  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
110 
111  template <typename stream_type,
112  typename header_type,
113  typename seq_type,
114  typename id_type,
115  typename ref_seq_type,
116  typename ref_id_type,
117  typename align_type,
118  typename cigar_type,
119  typename qual_type,
120  typename mate_type,
121  typename tag_dict_type>
122  void write_alignment_record([[maybe_unused]] stream_type & stream,
123  [[maybe_unused]] sam_file_output_options const & options,
124  [[maybe_unused]] header_type && header,
125  [[maybe_unused]] seq_type && seq,
126  [[maybe_unused]] qual_type && qual,
127  [[maybe_unused]] id_type && id,
128  [[maybe_unused]] int32_t const offset,
129  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
130  [[maybe_unused]] ref_id_type && ref_id,
131  [[maybe_unused]] std::optional<int32_t> ref_offset,
132  [[maybe_unused]] align_type && align,
133  [[maybe_unused]] cigar_type && cigar_vector,
134  [[maybe_unused]] sam_flag const flag,
135  [[maybe_unused]] uint8_t const mapq,
136  [[maybe_unused]] mate_type && mate,
137  [[maybe_unused]] tag_dict_type && tag_dict,
138  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
139  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
140 
141 private:
143  bool header_was_read{false};
144 
146  std::string string_buffer{};
147 
149  struct alignment_record_core
150  { // naming corresponds to official SAM/BAM specifications
151  int32_t block_size;
152  int32_t refID;
153  int32_t pos;
154  uint32_t l_read_name:8;
155  uint32_t mapq:8;
156  uint32_t bin:16;
157  uint32_t n_cigar_op:16;
158  sam_flag flag;
159  int32_t l_seq;
160  int32_t next_refID;
161  int32_t next_pos;
162  int32_t tlen;
163  };
164 
166  static constexpr std::array<uint8_t, 256> char_to_sam_rank
167  {
168  [] () constexpr
169  {
171 
172  using index_t = std::make_unsigned_t<char>;
173 
174  // ret['M'] = 0; set anyway by initialization
175  ret[static_cast<index_t>('I')] = 1;
176  ret[static_cast<index_t>('D')] = 2;
177  ret[static_cast<index_t>('N')] = 3;
178  ret[static_cast<index_t>('S')] = 4;
179  ret[static_cast<index_t>('H')] = 5;
180  ret[static_cast<index_t>('P')] = 6;
181  ret[static_cast<index_t>('=')] = 7;
182  ret[static_cast<index_t>('X')] = 8;
183 
184  return ret;
185  }()
186  };
187 
189  static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
190  {
191  --end;
192  if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
193  if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
194  if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
195  if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
196  if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
197  return 0;
198  }
199 
200  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
201 
208  template <typename stream_view_type, std::integral number_type>
209  void read_field(stream_view_type && stream_view, number_type & target)
210  {
211  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
212  }
213 
219  template <typename stream_view_type>
220  void read_field(stream_view_type && stream_view, float & target)
221  {
222  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
223  }
224 
235  template <typename stream_view_type, typename optional_value_type>
236  void read_field(stream_view_type && stream_view, std::optional<optional_value_type> & target)
237  {
238  optional_value_type tmp;
239  read_field(std::forward<stream_view_type>(stream_view), tmp);
240  target = tmp;
241  }
242 
243  template <typename stream_view_type, typename value_type>
244  void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
245  stream_view_type && stream_view,
246  value_type const & SEQAN3_DOXYGEN_ONLY(value));
247 
248  template <typename stream_view_type>
249  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
250 
251  template <typename cigar_input_type>
252  auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
253 
254  static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
255 };
256 
258 template <typename stream_type, // constraints checked by file
259  typename seq_legal_alph_type,
260  typename ref_seqs_type,
261  typename ref_ids_type,
262  typename seq_type,
263  typename id_type,
264  typename offset_type,
265  typename ref_seq_type,
266  typename ref_id_type,
267  typename ref_offset_type,
268  typename align_type,
269  typename cigar_type,
270  typename flag_type,
271  typename mapq_type,
272  typename qual_type,
273  typename mate_type,
274  typename tag_dict_type,
275  typename e_value_type,
276  typename bit_score_type>
277 inline void format_bam::read_alignment_record(stream_type & stream,
278  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
279  ref_seqs_type & ref_seqs,
281  seq_type & seq,
282  qual_type & qual,
283  id_type & id,
284  offset_type & offset,
285  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
286  ref_id_type & ref_id,
287  ref_offset_type & ref_offset,
288  align_type & align,
289  cigar_type & cigar_vector,
290  flag_type & flag,
291  mapq_type & mapq,
292  mate_type & mate,
293  tag_dict_type & tag_dict,
294  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
295  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
296 {
297  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
298  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
299  "The ref_offset must be a specialisation of std::optional.");
300 
301  static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
302  "The type of field::mapq must be uint8_t.");
303 
304  static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
305  "The type of field::flag must be seqan3::sam_flag.");
306 
307  auto stream_view = seqan3::detail::istreambuf(stream);
308 
309  // these variables need to be stored to compute the ALIGNMENT
310  [[maybe_unused]] int32_t offset_tmp{};
311  [[maybe_unused]] int32_t soft_clipping_end{};
312  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
313  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
314 
315  // Header
316  // -------------------------------------------------------------------------------------------------------------
317  if (!header_was_read)
318  {
319  // magic BAM string
320  if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
321  throw format_error{"File is not in BAM format."};
322 
323  int32_t l_text{}; // length of header text including \0 character
324  int32_t n_ref{}; // number of reference sequences
325  int32_t l_name{}; // 1 + length of reference name including \0 character
326  int32_t l_ref{}; // length of reference sequence
327 
328  read_field(stream_view, l_text);
329 
330  if (l_text > 0) // header text is present
331  read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
332 
333  read_field(stream_view, n_ref);
334 
335  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
336  {
337  read_field(stream_view, l_name);
338 
339  string_buffer.resize(l_name - 1);
340  std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.data()); // copy without \0 character
341  std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
342 
343  read_field(stream_view, l_ref);
344 
345  if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
346  {
347  // If there was no header text, we parse reference sequences block as header information
348  if (l_text == 0)
349  {
350  auto & reference_ids = header.ref_ids();
351  // put the length of the reference sequence into ref_id_info
352  header.ref_id_info.emplace_back(l_ref, "");
353  // put the reference name into reference_ids
354  reference_ids.push_back(string_buffer);
355  // assign the reference name an ascending reference id (starts at index 0).
356  header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
357  continue;
358  }
359  }
360 
361  auto id_it = header.ref_dict.find(string_buffer);
362 
363  // sanity checks of reference information to existing header object:
364  if (id_it == header.ref_dict.end()) // [unlikely]
365  {
366  throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
367  "' found in BAM file header (header.ref_ids():",
368  header.ref_ids(), ").")};
369  }
370  else if (id_it->second != ref_idx) // [unlikely]
371  {
372  throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
373  " does not correspond to the position ", id_it->second,
374  " in the header (header.ref_ids():", header.ref_ids(), ").")};
375  }
376  else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
377  {
378  throw format_error{"Provided reference has unequal length as specified in the header."};
379  }
380  }
381 
382  header_was_read = true;
383 
384  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
385  return;
386  }
387 
388  // read alignment record into buffer
389  // -------------------------------------------------------------------------------------------------------------
390  alignment_record_core core;
391  std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
392 
393  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
394  {
395  throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
396  "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
397  }
398  else if (core.refID > -1) // not unmapped
399  {
400  ref_id = core.refID; // field::ref_id
401  }
402 
403  flag = core.flag; // field::flag
404  mapq = core.mapq; // field::mapq
405 
406  if (core.pos > -1) // [[likely]]
407  ref_offset = core.pos; // field::ref_offset
408 
409  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
410  {
411  if (core.next_refID > -1)
412  get<0>(mate) = core.next_refID;
413 
414  if (core.next_pos > -1) // [[likely]]
415  get<1>(mate) = core.next_pos;
416 
417  get<2>(mate) = core.tlen;
418  }
419 
420  // read id
421  // -------------------------------------------------------------------------------------------------------------
422  read_field(stream_view | detail::take_exactly_or_throw(core.l_read_name - 1), id); // field::id
423  std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
424 
425  // read cigar string
426  // -------------------------------------------------------------------------------------------------------------
427  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
428  {
429  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
430  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
431  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
432  }
433  else
434  {
435  detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
436  }
437 
438  offset = offset_tmp;
439 
440  // read sequence
441  // -------------------------------------------------------------------------------------------------------------
442  if (core.l_seq > 0) // sequence information is given
443  {
444  auto seq_stream = stream_view
445  | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
447  {
448  return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
449  dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
450  });
451 
452  if constexpr (detail::decays_to_ignore_v<seq_type>)
453  {
454  auto skip_sequence_bytes = [&] ()
455  {
456  detail::consume(seq_stream);
457  if (core.l_seq & 1)
458  std::ranges::next(std::ranges::begin(stream_view));
459  };
460 
461  if constexpr (!detail::decays_to_ignore_v<align_type>)
462  {
463  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
464  "If you want to read ALIGNMENT but not SEQ, the alignment"
465  " object must store a sequence container at the second (query) position.");
466 
467  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
468  {
469  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
470  using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
471  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
472 
473  get<1>(align).reserve(seq_length);
474 
475  auto tmp_iter = std::ranges::begin(seq_stream);
476  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
477 
478  if (offset_tmp & 1)
479  {
480  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
481  ++tmp_iter;
482  --seq_length;
483  }
484 
485  for (size_t i = (seq_length / 2); i > 0; --i)
486  {
487  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
488  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
489  ++tmp_iter;
490  }
491 
492  if (seq_length & 1)
493  {
494  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
495  ++tmp_iter;
496  --soft_clipping_end;
497  }
498 
499  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
500  }
501  else
502  {
503  skip_sequence_bytes();
504  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
505  }
506  }
507  else
508  {
509  skip_sequence_bytes();
510  }
511  }
512  else
513  {
514  using alph_t = std::ranges::range_value_t<decltype(seq)>;
515  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
516 
517  for (auto [d1, d2] : seq_stream)
518  {
519  seq.push_back(from_dna16[to_rank(d1)]);
520  seq.push_back(from_dna16[to_rank(d2)]);
521  }
522 
523  if (core.l_seq & 1)
524  {
525  dna16sam d = dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
526  seq.push_back(from_dna16[to_rank(d)]);
527  std::ranges::next(std::ranges::begin(stream_view));
528  }
529 
530  if constexpr (!detail::decays_to_ignore_v<align_type>)
531  {
532  assign_unaligned(get<1>(align),
533  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
534  std::ranges::distance(seq) - soft_clipping_end));
535  }
536  }
537  }
538 
539  // read qual string
540  // -------------------------------------------------------------------------------------------------------------
541  read_field(stream_view | detail::take_exactly_or_throw(core.l_seq)
542  | std::views::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
543 
544  // All remaining optional fields if any: SAM tags dictionary
545  // -------------------------------------------------------------------------------------------------------------
546  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
547  core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
548  assert(remaining_bytes >= 0);
549  auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
550 
551  while (tags_view.size() > 0)
552  read_field(tags_view, tag_dict);
553 
554  // DONE READING - wrap up
555  // -------------------------------------------------------------------------------------------------------------
556  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
557  {
558  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
559  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
560  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
561  if (core.l_seq != 0 && offset_tmp == core.l_seq)
562  {
563  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
564  { // maybe only throw in debug mode and otherwise return an empty alignment?
565  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
566  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
567  "stored in the optional field CG. You need to read in the field::tags and "
568  "field::seq in order to access this information.")};
569  }
570  else
571  {
572  auto it = tag_dict.find("CG"_tag);
573 
574  if (it == tag_dict.end())
575  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
576  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
577  "stored in the optional field CG but this tag is not present in the given ",
578  "record.")};
579 
580  auto cigar_view = std::views::all(std::get<std::string>(it->second));
581  std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
582  offset_tmp = soft_clipping_end = 0;
583  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
584  tag_dict.erase(it); // remove redundant information
585 
586  if constexpr (!detail::decays_to_ignore_v<align_type>)
587  {
588  assign_unaligned(get<1>(align),
589  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
590  std::ranges::distance(seq) - soft_clipping_end));
591  }
592  }
593  }
594  }
595 
596  // Alignment object construction
597  if constexpr (!detail::decays_to_ignore_v<align_type>)
598  construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
599 
600  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
601  std::swap(cigar_vector, tmp_cigar_vector);
602 }
603 
605 template <typename stream_type,
606  typename header_type,
607  typename seq_type,
608  typename id_type,
609  typename ref_seq_type,
610  typename ref_id_type,
611  typename align_type,
612  typename cigar_type,
613  typename qual_type,
614  typename mate_type,
615  typename tag_dict_type>
616 inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
617  [[maybe_unused]] sam_file_output_options const & options,
618  [[maybe_unused]] header_type && header,
619  [[maybe_unused]] seq_type && seq,
620  [[maybe_unused]] qual_type && qual,
621  [[maybe_unused]] id_type && id,
622  [[maybe_unused]] int32_t const offset,
623  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
624  [[maybe_unused]] ref_id_type && ref_id,
625  [[maybe_unused]] std::optional<int32_t> ref_offset,
626  [[maybe_unused]] align_type && align,
627  [[maybe_unused]] cigar_type && cigar_vector,
628  [[maybe_unused]] sam_flag const flag,
629  [[maybe_unused]] uint8_t const mapq,
630  [[maybe_unused]] mate_type && mate,
631  [[maybe_unused]] tag_dict_type && tag_dict,
632  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
633  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
634 {
635  // ---------------------------------------------------------------------
636  // Type Requirements (as static asserts for user friendliness)
637  // ---------------------------------------------------------------------
638  static_assert((std::ranges::forward_range<seq_type> &&
639  alphabet<std::ranges::range_reference_t<seq_type>>),
640  "The seq object must be a std::ranges::forward_range over "
641  "letters that model seqan3::alphabet.");
642 
643  static_assert((std::ranges::forward_range<id_type> &&
644  alphabet<std::ranges::range_reference_t<id_type>>),
645  "The id object must be a std::ranges::forward_range over "
646  "letters that model seqan3::alphabet.");
647 
648  static_assert((std::ranges::forward_range<ref_seq_type> &&
649  alphabet<std::ranges::range_reference_t<ref_seq_type>>),
650  "The ref_seq object must be a std::ranges::forward_range "
651  "over letters that model seqan3::alphabet.");
652 
653  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
654  {
655  static_assert((std::ranges::forward_range<ref_id_type> ||
656  std::integral<std::remove_reference_t<ref_id_type>> ||
657  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
658  "The ref_id object must be a std::ranges::forward_range "
659  "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
660  }
661 
663  "The align object must be a std::pair of two ranges whose "
664  "value_type is comparable to seqan3::gap");
665 
666  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
667  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
668  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
669  "The align object must be a std::pair of two ranges whose "
670  "value_type is comparable to seqan3::gap");
671 
672  static_assert((std::ranges::forward_range<qual_type> &&
673  alphabet<std::ranges::range_reference_t<qual_type>>),
674  "The qual object must be a std::ranges::forward_range "
675  "over letters that model seqan3::alphabet.");
676 
678  "The mate object must be a std::tuple of size 3 with "
679  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
680  "2) a std::integral or std::optional<std::integral>, and "
681  "3) a std::integral.");
682 
683  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
684  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
685  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
686  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
687  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
688  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
689  "The mate object must be a std::tuple of size 3 with "
690  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
691  "2) a std::integral or std::optional<std::integral>, and "
692  "3) a std::integral.");
693 
694  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
695  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
696 
697  if constexpr (detail::decays_to_ignore_v<header_type>)
698  {
699  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
700  "You can either construct the output file with ref_ids and ref_seqs information and "
701  "the header will be created for you, or you can access the `header` member directly."};
702  }
703  else
704  {
705  // ---------------------------------------------------------------------
706  // logical Requirements
707  // ---------------------------------------------------------------------
708 
709  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
710  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
711 
712  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
713 
714  // ---------------------------------------------------------------------
715  // Writing the BAM Header on first call
716  // ---------------------------------------------------------------------
717  if (!header_was_written)
718  {
719  stream << "BAM\1";
721  write_header(os, options, header); // write SAM header to temporary stream to query the size.
722  int32_t l_text{static_cast<int32_t>(os.str().size())};
723  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
724 
725  stream << os.str();
726 
727  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
728  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
729 
730  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
731  {
732  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
733  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
734  // write reference name:
735  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
736  stream_it = '\0';
737  // write reference sequence length:
738  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
739  }
740 
741  header_was_written = true;
742  }
743 
744  // ---------------------------------------------------------------------
745  // Writing the Record
746  // ---------------------------------------------------------------------
747  int32_t ref_length{};
748 
749  // if alignment is non-empty, replace cigar_vector.
750  // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
751  if (!std::ranges::empty(cigar_vector))
752  {
753  int32_t dummy_seq_length{};
754  for (auto & [count, operation] : cigar_vector)
755  detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
756  }
757  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
758  {
759  ref_length = std::ranges::distance(get<1>(align));
760 
761  // compute possible distance from alignment end to sequence end
762  // which indicates soft clipping at the end.
763  // This should be replaced by a free count_gaps function for
764  // aligned sequences which is more efficient if possible.
765  int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
766 
767  for (auto chr : get<1>(align))
768  if (chr == gap{})
769  ++off_end;
770 
771  off_end -= ref_length;
772  cigar_vector = detail::get_cigar_vector(align, offset, off_end);
773  }
774 
775  if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
776  {
777  tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
778  cigar_vector.resize(2);
779  cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
780  cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
781  }
782 
783  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
784 
785  // Compute the value for the l_read_name field for the bam record.
786  // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
787  // the data type to store the value is uint8_t and 255 is the maximal size.
788  // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
789  // 2 bytes.
790  uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
791  read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
792 
793  alignment_record_core core
794  {
795  /* block_size */ 0, // will be initialised right after
796  /* refID */ -1, // will be initialised right after
797  /* pos */ ref_offset.value_or(-1),
798  /* l_read_name */ read_name_size,
799  /* mapq */ mapq,
800  /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
801  /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
802  /* flag */ flag,
803  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
804  /* next_refId */ -1, // will be initialised right after
805  /* next_pos */ get<1>(mate).value_or(-1),
806  /* tlen */ get<2>(mate)
807  };
808 
809  auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
810  [[maybe_unused]] auto & id_target)
811  {
812  using id_t = std::remove_reference_t<decltype(id_source)>;
813 
814  if constexpr (!detail::decays_to_ignore_v<id_t>)
815  {
816  if constexpr (std::integral<id_t>)
817  {
818  id_target = id_source;
819  }
820  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
821  {
822  id_target = id_source.value_or(-1);
823  }
824  else
825  {
826  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
827  {
828  auto id_it = header.ref_dict.end();
829 
830  if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
831  std::ranges::sized_range<decltype(id_source)> &&
832  std::ranges::borrowed_range<decltype(id_source)>)
833  {
834  id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
835  std::ranges::size(id_source)});
836  }
837  else
838  {
839  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
840 
841  static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
842  "The ref_id type is not convertible to the reference id information stored in the "
843  "reference dictionary of the header object.");
844 
845  id_it = header.ref_dict.find(id_source);
846  }
847 
848  if (id_it == header.ref_dict.end())
849  {
850  throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
851  "not be found in BAM header ref_dict: ",
852  header.ref_dict, ".")};
853  }
854 
855  id_target = id_it->second;
856  }
857  }
858  }
859  };
860 
861  // initialise core.refID
862  check_and_assign_id_to(ref_id, core.refID);
863 
864  // initialise core.next_refID
865  check_and_assign_id_to(get<0>(mate), core.next_refID);
866 
867  // initialise core.block_size
868  core.block_size = sizeof(core) - 4/*block_size excluded*/ +
869  core.l_read_name +
870  core.n_cigar_op * 4 + // each int32_t has 4 bytes
871  (core.l_seq + 1) / 2 + // bitcompressed seq
872  core.l_seq + // quality string
873  tag_dict_binary_str.size();
874 
875  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
876 
877  if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
878  stream_it = '*';
879  else
880  std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
881  stream_it = '\0';
882 
883  // write cigar
884  for (auto [cigar_count, op] : cigar_vector)
885  {
886  cigar_count = cigar_count << 4;
887  cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
888  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
889  }
890 
891  // write seq (bit-compressed: dna16sam characters go into one byte)
892  using alph_t = std::ranges::range_value_t<seq_type>;
893  constexpr auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
894 
895  auto sit = std::ranges::begin(seq);
896  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
897  {
898  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
899  ++sidx, ++sit;
900  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
901  stream_it = static_cast<char>(compressed_chr);
902  }
903 
904  if (core.l_seq & 1) // write one more
905  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
906 
907  // write qual
908  if (std::ranges::empty(qual))
909  {
910  auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
911  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
912  }
913  else
914  {
915  if (std::ranges::distance(qual) != core.l_seq)
916  throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
917  core.l_seq, ". Got quality with size ",
918  std::ranges::distance(qual), " instead.")};
919 
920  auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
921  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
922  }
923 
924  // write optional fields
925  stream << tag_dict_binary_str;
926  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
927 }
928 
930 template <typename stream_view_type, typename value_type>
931 inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
932  stream_view_type && stream_view,
933  value_type const & SEQAN3_DOXYGEN_ONLY(value))
934 {
935  int32_t count;
936  read_field(stream_view, count); // read length of vector
937  std::vector<value_type> tmp_vector;
938  tmp_vector.reserve(count);
939 
940  while (count > 0)
941  {
942  value_type tmp{};
943  read_field(stream_view, tmp);
944  tmp_vector.push_back(std::move(tmp));
945  --count;
946  }
947  variant = std::move(tmp_vector);
948 }
949 
967 template <typename stream_view_type>
968 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
969 {
970  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
971  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
972  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
973  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
974  by the length (int32_t) of the array, followed by the values.
975  */
976  auto it = std::ranges::begin(stream_view);
977  uint16_t tag = static_cast<uint16_t>(*it) << 8;
978  ++it; // skip char read before
979 
980  tag += static_cast<uint16_t>(*it);
981  ++it; // skip char read before
982 
983  char type_id = *it;
984  ++it; // skip char read before
985 
986  switch (type_id)
987  {
988  case 'A' : // char
989  {
990  target[tag] = *it;
991  ++it; // skip char that has been read
992  break;
993  }
994  // all integer sizes are possible
995  case 'c' : // int8_t
996  {
997  int8_t tmp;
998  read_field(stream_view, tmp);
999  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1000  break;
1001  }
1002  case 'C' : // uint8_t
1003  {
1004  uint8_t tmp;
1005  read_field(stream_view, tmp);
1006  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1007  break;
1008  }
1009  case 's' : // int16_t
1010  {
1011  int16_t tmp;
1012  read_field(stream_view, tmp);
1013  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1014  break;
1015  }
1016  case 'S' : // uint16_t
1017  {
1018  uint16_t tmp;
1019  read_field(stream_view, tmp);
1020  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1021  break;
1022  }
1023  case 'i' : // int32_t
1024  {
1025  int32_t tmp;
1026  read_field(stream_view, tmp);
1027  target[tag] = std::move(tmp); // readable sam format only allows int32_t
1028  break;
1029  }
1030  case 'I' : // uint32_t
1031  {
1032  uint32_t tmp;
1033  read_field(stream_view, tmp);
1034  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1035  break;
1036  }
1037  case 'f' : // float
1038  {
1039  float tmp;
1040  read_field(stream_view, tmp);
1041  target[tag] = tmp;
1042  break;
1043  }
1044  case 'Z' : // string
1045  {
1046  string_buffer.clear();
1047  while (!is_char<'\0'>(*it))
1048  {
1049  string_buffer.push_back(*it);
1050  ++it;
1051  }
1052  ++it; // skip \0
1053  target[tag] = string_buffer;
1054  break;
1055  }
1056  case 'H' : // byte array, represented as null-terminated string; specification requires even number of bytes
1057  {
1058  std::vector<std::byte> byte_array;
1059  std::byte value;
1060  while (!is_char<'\0'>(*it))
1061  {
1062  string_buffer.clear();
1063  string_buffer.push_back(*it);
1064  ++it;
1065 
1066  if (*it == '\0')
1067  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1068 
1069  string_buffer.push_back(*it);
1070  ++it;
1071  read_field(string_buffer, value);
1072  byte_array.push_back(value);
1073  }
1074  ++it; // skip \0
1075  target[tag] = byte_array;
1076  break;
1077  }
1078  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1079  {
1080  char array_value_type_id = *it;
1081  ++it; // skip char read before
1082 
1083  switch (array_value_type_id)
1084  {
1085  case 'c' : // int8_t
1086  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1087  break;
1088  case 'C' : // uint8_t
1089  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1090  break;
1091  case 's' : // int16_t
1092  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1093  break;
1094  case 'S' : // uint16_t
1095  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1096  break;
1097  case 'i' : // int32_t
1098  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1099  break;
1100  case 'I' : // uint32_t
1101  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1102  break;
1103  case 'f' : // float
1104  read_sam_dict_vector(target[tag], stream_view, float{});
1105  break;
1106  default:
1107  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1108  "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1109  }
1110  break;
1111  }
1112  default:
1113  throw format_error{detail::to_string("The second character in the numerical id of a "
1114  "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1115  }
1116 }
1117 
1132 template <typename cigar_input_type>
1133 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1134 {
1135  std::vector<cigar> operations{};
1136  char operation{'\0'};
1137  uint32_t count{};
1138  int32_t ref_length{}, seq_length{};
1139  uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1140  constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1141  constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1142 
1143  if (n_cigar_op == 0) // [[unlikely]]
1144  return std::tuple{operations, ref_length, seq_length};
1145 
1146  // parse the rest of the cigar
1147  // -------------------------------------------------------------------------------------------------------------
1148  while (n_cigar_op > 0) // until stream is not empty
1149  {
1150  std::ranges::copy_n(std::ranges::begin(cigar_input),
1151  sizeof(operation_and_count),
1152  reinterpret_cast<char*>(&operation_and_count));
1153  operation = cigar_mapping[operation_and_count & cigar_mask];
1154  count = operation_and_count >> 4;
1155 
1156  detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1157  operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1158  --n_cigar_op;
1159  }
1160 
1161  return std::tuple{operations, ref_length, seq_length};
1162 }
1163 
1167 inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1168 {
1169  std::string result{};
1170 
1171  auto stream_variant_fn = [&result] (auto && arg) // helper to print a std::variant
1172  {
1173  // T is either char, int32_t, float, std::string, or a std::vector<some int>
1174  using T = std::remove_cvref_t<decltype(arg)>;
1175 
1176  if constexpr (std::same_as<T, int32_t>)
1177  {
1178  // always choose the smallest possible representation [cCsSiI]
1179  size_t const absolute_arg = std::abs(arg);
1180  auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1181  bool const negative = arg < 0;
1182  n = n * n + 2 * negative; // for switch case order
1183 
1184  switch (n)
1185  {
1186  case 0:
1187  {
1188  result[result.size() - 1] = 'C';
1189  result.append(reinterpret_cast<char const *>(&arg), 1);
1190  break;
1191  }
1192  case 1:
1193  {
1194  result[result.size() - 1] = 'S';
1195  result.append(reinterpret_cast<char const *>(&arg), 2);
1196  break;
1197  }
1198  case 2:
1199  {
1200  result[result.size() - 1] = 'c';
1201  int8_t tmp = static_cast<int8_t>(arg);
1202  result.append(reinterpret_cast<char const *>(&tmp), 1);
1203  break;
1204  }
1205  case 3:
1206  {
1207  result[result.size() - 1] = 's';
1208  int16_t tmp = static_cast<int16_t>(arg);
1209  result.append(reinterpret_cast<char const *>(&tmp), 2);
1210  break;
1211  }
1212  default:
1213  {
1214  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1215  break;
1216  }
1217  }
1218  }
1219  else if constexpr (std::same_as<T, std::string>)
1220  {
1221  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1222  }
1223  else if constexpr (!std::ranges::range<T>) // char, float
1224  {
1225  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1226  }
1227  else // std::vector of some arithmetic_type type
1228  {
1229  int32_t sz{static_cast<int32_t>(arg.size())};
1230  result.append(reinterpret_cast<char *>(&sz), 4);
1231  result.append(reinterpret_cast<char const *>(arg.data()),
1232  arg.size() * sizeof(std::ranges::range_value_t<T>));
1233  }
1234  };
1235 
1236  for (auto & [tag, variant] : tag_dict)
1237  {
1238  result.push_back(static_cast<char>(tag / 256));
1239  result.push_back(static_cast<char>(tag % 256));
1240 
1241  result.push_back(detail::sam_tag_type_char[variant.index()]);
1242 
1243  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1244  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1245 
1246  std::visit(stream_variant_fn, variant);
1247  }
1248 
1249  return result;
1250 }
1251 
1252 } // namespace seqan3
The <bit> header from C++20's standard library.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:191
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: dna16sam.hpp:48
The BAM format.
Definition: format_bam.hpp:50
format_bam()=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:277
void write_alignment_record([[maybe_unused]] stream_type &stream, [[maybe_unused]] sam_file_output_options const &options, [[maybe_unused]] header_type &&header, [[maybe_unused]] seq_type &&seq, [[maybe_unused]] qual_type &&qual, [[maybe_unused]] id_type &&id, [[maybe_unused]] int32_t const offset, [[maybe_unused]] ref_seq_type &&ref_seq, [[maybe_unused]] ref_id_type &&ref_id, [[maybe_unused]] std::optional< int32_t > ref_offset, [[maybe_unused]] align_type &&align, [[maybe_unused]] cigar_type &&cigar_vector, [[maybe_unused]] sam_flag const flag, [[maybe_unused]] uint8_t const mapq, [[maybe_unused]] mate_type &&mate, [[maybe_unused]] tag_dict_type &&tag_dict, [[maybe_unused]] double e_value, [[maybe_unused]] double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:616
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:67
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:35
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:160
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:157
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:121
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:337
T clear(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T end(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:162
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:228
typename remove_cvref< t >::type remove_cvref_t
Return the input type with const, volatile and references removed (transformation_trait shortcut).
Definition: type_traits:73
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
The <ranges> header from C++20's standard library.
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T tuple_size_v
T visit(T... args)