SeqAn3  3.1.0
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <seqan3/std/ranges>
17 #include <string>
18 #include <vector>
19 
38 
39 namespace seqan3
40 {
41 
116 class format_sam : private detail::format_sam_base
117 {
118 public:
122  // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
123  format_sam() = default;
124  format_sam(format_sam const &) = default;
125  format_sam & operator=(format_sam const &) = default;
126  format_sam(format_sam &&) = default;
127  format_sam & operator=(format_sam &&) = default;
128  ~format_sam() = default;
129 
131 
134  {
135  { "sam" },
136  };
137 
138 protected:
139  template <typename stream_type, // constraints checked by file
140  typename seq_legal_alph_type,
141  typename seq_type, // other constraints checked inside function
142  typename id_type,
143  typename qual_type>
144  void read_sequence_record(stream_type & stream,
146  seq_type & sequence,
147  id_type & id,
148  qual_type & qualities);
149 
150  template <typename stream_type, // constraints checked by file
151  typename seq_type, // other constraints checked inside function
152  typename id_type,
153  typename qual_type>
154  void write_sequence_record(stream_type & stream,
155  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
156  seq_type && sequence,
157  id_type && id,
158  qual_type && qualities);
159 
160  template <typename stream_type, // constraints checked by file
161  typename seq_legal_alph_type,
162  typename ref_seqs_type,
163  typename ref_ids_type,
164  typename seq_type,
165  typename id_type,
166  typename offset_type,
167  typename ref_seq_type,
168  typename ref_id_type,
169  typename ref_offset_type,
170  typename align_type,
171  typename cigar_type,
172  typename flag_type,
173  typename mapq_type,
174  typename qual_type,
175  typename mate_type,
176  typename tag_dict_type,
177  typename e_value_type,
178  typename bit_score_type>
179  void read_alignment_record(stream_type & stream,
180  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
181  ref_seqs_type & ref_seqs,
183  seq_type & seq,
184  qual_type & qual,
185  id_type & id,
186  offset_type & offset,
187  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
188  ref_id_type & ref_id,
189  ref_offset_type & ref_offset,
190  align_type & align,
191  cigar_type & cigar_vector,
192  flag_type & flag,
193  mapq_type & mapq,
194  mate_type & mate,
195  tag_dict_type & tag_dict,
196  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
197  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
198 
199  template <typename stream_type,
200  typename header_type,
201  typename seq_type,
202  typename id_type,
203  typename ref_seq_type,
204  typename ref_id_type,
205  typename align_type,
206  typename qual_type,
207  typename mate_type,
208  typename tag_dict_type,
209  typename e_value_type,
210  typename bit_score_type>
211  void write_alignment_record(stream_type & stream,
212  sam_file_output_options const & options,
213  header_type && header,
214  seq_type && seq,
215  qual_type && qual,
216  id_type && id,
217  int32_t const offset,
218  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
219  ref_id_type && ref_id,
220  std::optional<int32_t> ref_offset,
221  align_type && align,
222  std::vector<cigar> const & cigar_vector,
223  sam_flag const flag,
224  uint8_t const mapq,
225  mate_type && mate,
226  tag_dict_type && tag_dict,
227  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
228  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
229 
230 private:
232  std::string tmp_qual{};
233 
235  static constexpr std::string_view dummy{};
236 
238  sam_file_header<> default_header{};
239 
241  bool ref_info_present_in_header{false};
242 
244  std::string_view const & default_or(detail::ignore_t) const noexcept
245  {
246  return dummy;
247  }
248 
250  template <typename t>
251  decltype(auto) default_or(t && v) const noexcept
252  {
253  return std::forward<t>(v);
254  }
255 
256  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
257 
258  template <typename stream_view_type, typename value_type>
259  void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
260  stream_view_type && stream_view,
261  value_type value);
262 
263  template <typename stream_view_type>
264  void read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant,
265  stream_view_type && stream_view);
266 
267  template <typename stream_view_type>
268  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
269 
270  template <typename stream_it_t, std::ranges::forward_range field_type>
271  void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
272 
273  template <typename stream_it_t>
274  void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
275 
276  template <typename stream_it_t>
277  void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
278 };
279 
281 template <typename stream_type, // constraints checked by file
282  typename seq_legal_alph_type,
283  typename seq_type, // other constraints checked inside function
284  typename id_type,
285  typename qual_type>
286 inline void format_sam::read_sequence_record(stream_type & stream,
288  seq_type & sequence,
289  id_type & id,
290  qual_type & qualities)
291 {
293 
294  {
295  read_alignment_record(stream, align_options, std::ignore, default_header, sequence, qualities, id,
296  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
297  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
298  }
299 
300  if constexpr (!detail::decays_to_ignore_v<seq_type>)
301  if (std::ranges::distance(sequence) == 0)
302  throw parse_error{"The sequence information must not be empty."};
303  if constexpr (!detail::decays_to_ignore_v<id_type>)
304  if (std::ranges::distance(id) == 0)
305  throw parse_error{"The id information must not be empty."};
306 
307  if (options.truncate_ids)
308  id = id | detail::take_until_and_consume(is_space) | views::to<id_type>;
309 }
310 
312 template <typename stream_type, // constraints checked by file
313  typename seq_type, // other constraints checked inside function
314  typename id_type,
315  typename qual_type>
316 inline void format_sam::write_sequence_record(stream_type & stream,
317  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
318  seq_type && sequence,
319  id_type && id,
320  qual_type && qualities)
321 {
322  using default_align_t = std::pair<std::span<gapped<char>>, std::span<gapped<char>>>;
323  using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
324 
325  sam_file_output_options output_options;
326 
327  write_alignment_record(stream,
328  output_options,
329  /*header*/ std::ignore,
330  /*seq*/ default_or(sequence),
331  /*qual*/ default_or(qualities),
332  /*id*/ default_or(id),
333  /*offset*/ 0,
334  /*ref_seq*/ std::string_view{},
335  /*ref_id*/ std::string_view{},
336  /*ref_offset*/ -1,
337  /*align*/ default_align_t{},
338  /*cigar_vector*/ std::vector<cigar>{},
339  /*flag*/ sam_flag::none,
340  /*mapq*/ 0,
341  /*mate*/ default_mate_t{},
342  /*tag_dict*/ sam_tag_dictionary{},
343  /*e_value*/ 0,
344  /*bit_score*/ 0);
345 }
346 
348 template <typename stream_type, // constraints checked by file
349  typename seq_legal_alph_type,
350  typename ref_seqs_type,
351  typename ref_ids_type,
352  typename seq_type,
353  typename id_type,
354  typename offset_type,
355  typename ref_seq_type,
356  typename ref_id_type,
357  typename ref_offset_type,
358  typename align_type,
359  typename cigar_type,
360  typename flag_type,
361  typename mapq_type,
362  typename qual_type,
363  typename mate_type,
364  typename tag_dict_type,
365  typename e_value_type,
366  typename bit_score_type>
367 inline void format_sam::read_alignment_record(stream_type & stream,
368  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
369  ref_seqs_type & ref_seqs,
371  seq_type & seq,
372  qual_type & qual,
373  id_type & id,
374  offset_type & offset,
375  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
376  ref_id_type & ref_id,
377  ref_offset_type & ref_offset,
378  align_type & align,
379  cigar_type & cigar_vector,
380  flag_type & flag,
381  mapq_type & mapq,
382  mate_type & mate,
383  tag_dict_type & tag_dict,
384  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
385  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
386 {
387  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
388  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
389  "The ref_offset must be a specialisation of std::optional.");
390 
391  auto stream_view = detail::istreambuf(stream);
392  auto field_view = stream_view | detail::take_until_or_throw_and_consume(is_char<'\t'>);
393 
394  // these variables need to be stored to compute the ALIGNMENT
395  int32_t ref_offset_tmp{};
396  std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{};
397  [[maybe_unused]] int32_t offset_tmp{};
398  [[maybe_unused]] int32_t soft_clipping_end{};
399  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
400  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
401 
402  // Header
403  // -------------------------------------------------------------------------------------------------------------
404  if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
405  {
406  read_header(stream_view, header, ref_seqs);
407 
408  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
409  return;
410  }
411 
412  // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
413  // -------------------------------------------------------------------------------------------------------------
414  read_field(field_view, id);
415 
416  uint16_t flag_integral{};
417  read_field(field_view, flag_integral);
418  flag = sam_flag{flag_integral};
419 
420  read_field(field_view, ref_id_tmp);
421  check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
422 
423  read_field(field_view, ref_offset_tmp);
424  --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
425 
426  if (ref_offset_tmp == -1)
427  ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
428  else if (ref_offset_tmp > -1)
429  ref_offset = ref_offset_tmp;
430  else if (ref_offset_tmp < -1)
431  throw format_error{"No negative values are allowed for field::ref_offset."};
432 
433  read_field(field_view, mapq);
434 
435  // Field 6: CIGAR
436  // -------------------------------------------------------------------------------------------------------------
437  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
438  {
439  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
440  {
441  std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
442  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
443  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
444  }
445  else
446  {
447  std::ranges::next(std::ranges::begin(field_view)); // skip '*'
448  }
449  }
450  else
451  {
452  detail::consume(field_view);
453  }
454 
455  offset = offset_tmp;
456 
457  // Field 7-9: (RNEXT PNEXT TLEN) = MATE
458  // -------------------------------------------------------------------------------------------------------------
459  if constexpr (!detail::decays_to_ignore_v<mate_type>)
460  {
461  std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
462  read_field(field_view, tmp_mate_ref_id); // RNEXT
463 
464  if (tmp_mate_ref_id == "=") // indicates "same as ref id"
465  {
466  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
467  get<0>(mate) = ref_id;
468  else
469  check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
470  }
471  else
472  {
473  check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
474  }
475 
476  int32_t tmp_pnext{};
477  read_field(field_view, tmp_pnext); // PNEXT
478 
479  if (tmp_pnext > 0)
480  get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
481  else if (tmp_pnext < 0)
482  throw format_error{"No negative values are allowed at the mate mapping position."};
483  // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
484 
485  read_field(field_view, get<2>(mate)); // TLEN
486  }
487  else
488  {
489  for (size_t i = 0; i < 3u; ++i)
490  {
491  detail::consume(field_view);
492  }
493  }
494 
495  // Field 10: Sequence
496  // -------------------------------------------------------------------------------------------------------------
497  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
498  {
499  auto constexpr is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
500  auto seq_stream = field_view | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
501  {
502  if (!is_legal_alph(c))
503  throw parse_error{std::string{"Encountered an unexpected letter: "} +
504  "char_is_valid_for<" +
505  detail::type_name_as_string<seq_legal_alph_type> +
506  "> evaluated to false on " +
507  detail::make_printable(c)};
508  return c;
509  });
510 
511  if constexpr (detail::decays_to_ignore_v<seq_type>)
512  {
513  if constexpr (!detail::decays_to_ignore_v<align_type>)
514  {
515  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
516  "If you want to read ALIGNMENT but not SEQ, the alignment"
517  " object must store a sequence container at the second (query) position.");
518 
519  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
520  {
521 
522  auto tmp_iter = std::ranges::begin(seq_stream);
523  std::ranges::advance(tmp_iter, offset_tmp);
524 
525  for (; seq_length > 0; --seq_length) // seq_length is not needed anymore
526  {
527  get<1>(align).push_back(std::ranges::range_value_t<decltype(get<1>(align))>{}.assign_char(*tmp_iter));
528  ++tmp_iter;
529  }
530 
531  std::ranges::advance(tmp_iter, soft_clipping_end);
532  }
533  else
534  {
535  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // empty container
536  }
537  }
538  else
539  {
540  detail::consume(seq_stream);
541  }
542  }
543  else
544  {
545  read_field(seq_stream, seq);
546 
547  if constexpr (!detail::decays_to_ignore_v<align_type>)
548  {
549  if (!tmp_cigar_vector.empty()) // if no alignment info is given, the field::alignment should remain empty
550  {
551  assign_unaligned(get<1>(align),
552  seq | views::slice(static_cast<decltype(std::ranges::size(seq))>(offset_tmp),
553  std::ranges::size(seq) - soft_clipping_end));
554  }
555  }
556  }
557  }
558  else
559  {
560  std::ranges::next(std::ranges::begin(field_view)); // skip '*'
561  }
562 
563  // Field 11: Quality
564  // -------------------------------------------------------------------------------------------------------------
565  auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
566  read_field(stream_view | detail::take_until_or_throw(tab_or_end), qual);
567 
568  if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
569  {
570  if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0 &&
571  std::ranges::distance(seq) != std::ranges::distance(qual))
572  {
573  throw format_error{detail::to_string("Sequence length (", std::ranges::distance(seq),
574  ") and quality length (", std::ranges::distance(qual),
575  ") must be the same.")};
576  }
577  }
578 
579  // All remaining optional fields if any: SAM tags dictionary
580  // -------------------------------------------------------------------------------------------------------------
581  while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
582  {
583  std::ranges::next(std::ranges::begin(stream_view)); // skip tab
584  read_field(stream_view | detail::take_until_or_throw(tab_or_end), tag_dict);
585  }
586 
587  detail::consume(stream_view | detail::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
588 
589  // DONE READING - wrap up
590  // -------------------------------------------------------------------------------------------------------------
591  // Alignment object construction
592  // Note that the query sequence in get<1>(align) has already been filled while reading Field 10.
593  if constexpr (!detail::decays_to_ignore_v<align_type>)
594  {
595  int32_t ref_idx{(ref_id_tmp.empty()/*unmapped read?*/) ? -1 : 0};
596 
597  if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
598  {
599  if (!ref_id_tmp.empty())
600  {
601  assert(header.ref_dict.count(ref_id_tmp) != 0); // taken care of in check_and_assign_ref_id()
602  ref_idx = header.ref_dict[ref_id_tmp]; // get index for reference sequence
603  }
604  }
605 
606  construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
607  }
608 
609  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
610  std::swap(cigar_vector, tmp_cigar_vector);
611 }
612 
614 template <typename stream_type,
615  typename header_type,
616  typename seq_type,
617  typename id_type,
618  typename ref_seq_type,
619  typename ref_id_type,
620  typename align_type,
621  typename qual_type,
622  typename mate_type,
623  typename tag_dict_type,
624  typename e_value_type,
625  typename bit_score_type>
626 inline void format_sam::write_alignment_record(stream_type & stream,
627  sam_file_output_options const & options,
628  header_type && header,
629  seq_type && seq,
630  qual_type && qual,
631  id_type && id,
632  int32_t const offset,
633  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
634  ref_id_type && ref_id,
635  std::optional<int32_t> ref_offset,
636  align_type && align,
637  std::vector<cigar> const & cigar_vector,
638  sam_flag const flag,
639  uint8_t const mapq,
640  mate_type && mate,
641  tag_dict_type && tag_dict,
642  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
643  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
644 {
645  /* Note the following general things:
646  *
647  * - Given the SAM specifications, all fields may be empty
648  *
649  * - arithmetic values default to 0 while all others default to '*'
650  *
651  * - Because of the former, arithmetic values can be directly streamed
652  * into 'stream' as operator<< is defined for all arithmetic types
653  * and the default value (0) is also the SAM default.
654  *
655  * - All other non-arithmetic values need to be checked for emptiness
656  */
657 
658  // ---------------------------------------------------------------------
659  // Type Requirements (as static asserts for user friendliness)
660  // ---------------------------------------------------------------------
661  static_assert((std::ranges::forward_range<seq_type> &&
662  alphabet<std::ranges::range_reference_t<seq_type>>),
663  "The seq object must be a std::ranges::forward_range over "
664  "letters that model seqan3::alphabet.");
665 
666  static_assert((std::ranges::forward_range<id_type> &&
667  alphabet<std::ranges::range_reference_t<id_type>>),
668  "The id object must be a std::ranges::forward_range over "
669  "letters that model seqan3::alphabet.");
670 
671  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
672  {
673  static_assert((std::ranges::forward_range<ref_id_type> ||
674  std::integral<std::remove_reference_t<ref_id_type>> ||
675  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
676  "The ref_id object must be a std::ranges::forward_range "
677  "over letters that model seqan3::alphabet.");
678 
679  if constexpr (std::integral<std::remove_cvref_t<ref_id_type>> ||
680  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
681  static_assert(!detail::decays_to_ignore_v<header_type>,
682  "If you give indices as reference id information the header must also be present.");
683  }
684 
686  "The align object must be a std::pair of two ranges whose "
687  "value_type is comparable to seqan3::gap");
688 
689  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
690  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
691  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
692  "The align object must be a std::pair of two ranges whose "
693  "value_type is comparable to seqan3::gap");
694 
695  static_assert((std::ranges::forward_range<qual_type> &&
696  alphabet<std::ranges::range_reference_t<qual_type>>),
697  "The qual object must be a std::ranges::forward_range "
698  "over letters that model seqan3::alphabet.");
699 
701  "The mate object must be a std::tuple of size 3 with "
702  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
703  "2) a std::integral or std::optional<std::integral>, and "
704  "3) a std::integral.");
705 
706  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
707  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
708  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
709  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
710  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
711  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
712  "The mate object must be a std::tuple of size 3 with "
713  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
714  "2) a std::integral or std::optional<std::integral>, and "
715  "3) a std::integral.");
716 
717  if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
718  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>)
719  static_assert(!detail::decays_to_ignore_v<header_type>,
720  "If you give indices as mate reference id information the header must also be present.");
721 
722  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
723  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
724 
725  // ---------------------------------------------------------------------
726  // logical Requirements
727  // ---------------------------------------------------------------------
728  if constexpr (!detail::decays_to_ignore_v<header_type> &&
729  !detail::decays_to_ignore_v<ref_id_type> &&
730  !std::integral<std::remove_reference_t<ref_id_type>> &&
731  !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
732  {
733 
734  if (options.sam_require_header && !std::ranges::empty(ref_id))
735  {
736  auto id_it = header.ref_dict.end();
737 
738  if constexpr (std::ranges::contiguous_range<decltype(ref_id)> &&
739  std::ranges::sized_range<decltype(ref_id)> &&
740  std::ranges::borrowed_range<decltype(ref_id)>)
741  {
742  id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
743  }
744  else
745  {
746  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
747 
749  "The ref_id type is not convertible to the reference id information stored in the "
750  "reference dictionary of the header object.");
751 
752  id_it = header.ref_dict.find(ref_id);
753  }
754 
755  if (id_it == header.ref_dict.end()) // no reference id matched
756  throw format_error{detail::to_string("The ref_id '", ref_id, "' was not in the list of references:",
757  header.ref_ids())};
758  }
759  }
760 
761  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
762  throw format_error{"The ref_offset object must be a std::integral >= 0."};
763 
764  // ---------------------------------------------------------------------
765  // Writing the Header on first call
766  // ---------------------------------------------------------------------
767  if constexpr (!detail::decays_to_ignore_v<header_type>)
768  {
769  if (options.sam_require_header && !header_was_written)
770  {
771  write_header(stream, options, header);
772  header_was_written = true;
773  }
774  }
775 
776  // ---------------------------------------------------------------------
777  // Writing the Record
778  // ---------------------------------------------------------------------
779 
780  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
781  constexpr char separator{'\t'};
782 
783  write_range_or_asterisk(stream_it, id);
784  *stream_it = separator;
785 
786  stream_it.write_number(static_cast<uint16_t>(flag));
787  *stream_it = separator;
788 
789  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
790  {
791  if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
792  {
793  write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
794  }
795  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
796  {
797  if (ref_id.has_value())
798  write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
799  else
800  *stream_it = '*';
801  }
802  else
803  {
804  write_range_or_asterisk(stream_it, ref_id);
805  }
806  }
807  else
808  {
809  *stream_it = '*';
810  }
811 
812  *stream_it = separator;
813 
814  // SAM is 1 based, 0 indicates unmapped read if optional is not set
815  stream_it.write_number(ref_offset.value_or(-1) + 1);
816  *stream_it = separator;
817 
818  stream_it.write_number(static_cast<unsigned>(mapq));
819  *stream_it = separator;
820 
821  if (!std::ranges::empty(cigar_vector))
822  {
823  for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
824  stream_it.write_range(c.to_string());
825  }
826  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
827  {
828  // compute possible distance from alignment end to sequence end
829  // which indicates soft clipping at the end.
830  // This should be replace by a free count_gaps function for
831  // aligned sequences which is more efficient if possible.
832  size_t off_end{std::ranges::size(seq) - offset};
833  for (auto chr : get<1>(align))
834  if (chr == gap{})
835  ++off_end;
836 
837  // Might happen if get<1>(align) doesn't correspond to the reference.
838  assert(off_end >= std::ranges::size(get<1>(align)));
839  off_end -= std::ranges::size(get<1>(align));
840 
841  write_range_or_asterisk(stream_it, detail::get_cigar_string(align, offset, off_end));
842  }
843  else
844  {
845  *stream_it = '*';
846  }
847 
848  *stream_it = separator;
849 
850  if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
851  {
852  write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
853  }
854  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>, std::optional>)
855  {
856  if (get<0>(mate).has_value())
857  // value_or(0) instead of value() (which is equivalent here) as a
858  // workaround for a ubsan false-positive in GCC8: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90058
859  write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value_or(0)]);
860  else
861  *stream_it = '*';
862  }
863  else
864  {
865  write_range_or_asterisk(stream_it, get<0>(mate));
866  }
867 
868  *stream_it = separator;
869 
870  if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
871  {
872  // SAM is 1 based, 0 indicates unmapped read if optional is not set
873  stream_it.write_number(get<1>(mate).value_or(-1) + 1);
874  *stream_it = separator;
875  }
876  else
877  {
878  stream_it.write_number(get<1>(mate));
879  *stream_it = separator;
880  }
881 
882  stream_it.write_number(get<2>(mate));
883  *stream_it = separator;
884 
885  write_range_or_asterisk(stream_it, seq);
886  *stream_it = separator;
887 
888  write_range_or_asterisk(stream_it, qual);
889 
890  write_tag_fields(stream_it, tag_dict, separator);
891 
892  stream_it.write_end_of_line(options.add_carriage_return);
893 }
894 
895 
913 template <typename stream_view_type, typename value_type>
914 inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
915  stream_view_type && stream_view,
916  value_type value)
917 {
918  std::vector<value_type> tmp_vector;
919  while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
920  {
921  read_field(stream_view | detail::take_until(is_char<','>), value);
922  tmp_vector.push_back(value);
923 
924  if (is_char<','>(*std::ranges::begin(stream_view)))
925  std::ranges::next(std::ranges::begin(stream_view)); // skip ','
926  }
927  variant = std::move(tmp_vector);
928 }
929 
943 template <typename stream_view_type>
944 inline void format_sam::read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant,
945  stream_view_type && stream_view)
946 {
947  std::vector<std::byte> tmp_vector;
948  std::byte value;
949 
950  while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
951  {
952  try
953  {
954  read_field(stream_view | detail::take_exactly_or_throw(2), value);
955  }
956  catch (std::exception const & e)
957  {
958  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
959  }
960 
961  tmp_vector.push_back(value);
962  }
963 
964  variant = std::move(tmp_vector);
965 }
966 
984 template <typename stream_view_type>
985 inline void format_sam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
986 {
987  /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
988  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
989  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
990  VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
991  */
992  uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
993  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
994  tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
995  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
996  std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
997  char type_id = *std::ranges::begin(stream_view);
998  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
999  std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
1000 
1001  switch (type_id)
1002  {
1003  case 'A' : // char
1004  {
1005  target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
1006  std::ranges::next(std::ranges::begin(stream_view)); // skip char that has been read
1007  break;
1008  }
1009  case 'i' : // int32_t
1010  {
1011  int32_t tmp;
1012  read_field(stream_view, tmp);
1013  target[tag] = tmp;
1014  break;
1015  }
1016  case 'f' : // float
1017  {
1018  float tmp;
1019  read_field(stream_view, tmp);
1020  target[tag] = tmp;
1021  break;
1022  }
1023  case 'Z' : // string
1024  {
1025  target[tag] = stream_view | views::to<std::string>;
1026  break;
1027  }
1028  case 'H' :
1029  {
1030  read_sam_byte_vector(target[tag], stream_view);
1031  break;
1032  }
1033  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1034  {
1035  char array_value_type_id = *std::ranges::begin(stream_view);
1036  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1037  std::ranges::next(std::ranges::begin(stream_view)); // skip first ','
1038 
1039  switch (array_value_type_id)
1040  {
1041  case 'c' : // int8_t
1042  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1043  break;
1044  case 'C' : // uint8_t
1045  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1046  break;
1047  case 's' : // int16_t
1048  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1049  break;
1050  case 'S' : // uint16_t
1051  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1052  break;
1053  case 'i' : // int32_t
1054  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1055  break;
1056  case 'I' : // uint32_t
1057  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1058  break;
1059  case 'f' : // float
1060  read_sam_dict_vector(target[tag], stream_view, float{});
1061  break;
1062  default:
1063  throw format_error{std::string("The first character in the numerical ") +
1064  "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id +
1065  "' was given."};
1066  }
1067  break;
1068  }
1069  default:
1070  throw format_error{std::string("The second character in the numerical id of a "
1071  "SAM tag must be one of [A,i,Z,H,B,f] but '") + type_id + "' was given."};
1072  }
1073 }
1074 
1082 template <typename stream_it_t, std::ranges::forward_range field_type>
1083 inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
1084 {
1085  if (std::ranges::empty(field_value))
1086  {
1087  *stream_it = '*';
1088  }
1089  else
1090  {
1091  if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
1092  stream_it.write_range(field_value);
1093  else // convert from alphabets to their character representation
1094  stream_it.write_range(field_value | views::to_char);
1095  }
1096 }
1097 
1104 template <typename stream_it_t>
1105 inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1106 {
1107  write_range_or_asterisk(stream_it, std::string_view{field_value});
1108 }
1109 
1117 template <typename stream_it_t>
1118 inline void format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1119 {
1120  auto const stream_variant_fn = [&stream_it] (auto && arg) // helper to print a std::variant
1121  {
1122  using T = std::remove_cvref_t<decltype(arg)>;
1123 
1124  if constexpr (std::ranges::input_range<T>)
1125  {
1126  if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1127  {
1128  stream_it.write_range(arg);
1129  }
1130  else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1131  {
1132  if (!std::ranges::empty(arg))
1133  {
1134  stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1135 
1136  for (auto && elem : arg | std::views::drop(1))
1137  {
1138  *stream_it = ',';
1139  stream_it.write_number(std::to_integer<uint8_t>(elem));
1140  }
1141  }
1142  }
1143  else
1144  {
1145  if (!std::ranges::empty(arg))
1146  {
1147  stream_it.write_number(*std::ranges::begin(arg));
1148 
1149  for (auto && elem : arg | std::views::drop(1))
1150  {
1151  *stream_it = ',';
1152  stream_it.write_number(elem);
1153  }
1154  }
1155  }
1156  }
1157  else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1158  {
1159  *stream_it = arg;
1160  }
1161  else // number
1162  {
1163  stream_it.write_number(arg);
1164  }
1165  };
1166 
1167  for (auto & [tag, variant] : tag_dict)
1168  {
1169  *stream_it = separator;
1170 
1171  char const char0 = tag / 256;
1172  char const char1 = tag % 256;
1173 
1174  *stream_it = char0;
1175  *stream_it = char1;
1176  *stream_it = ':';
1177  *stream_it = detail::sam_tag_type_char[variant.index()];
1178  *stream_it = ':';
1179 
1180  if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1181  {
1182  *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1183  *stream_it = ',';
1184  }
1185 
1186  std::visit(stream_variant_fn, variant);
1187  }
1188 }
1189 
1190 } // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
The SAM format (tag).
Definition: format_sam.hpp:117
format_sam & operator=(format_sam const &)=default
Defaulted.
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam &&)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:367
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:134
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:316
format_sam(format_sam &&)=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:286
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:626
format_sam()=default
Defaulted.
format_sam(format_sam const &)=default
Defaulted.
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:35
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:160
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:121
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:337
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_char
Return the char representation of an alphabet object.
Definition: concept.hpp:387
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
typename remove_cvref< t >::type remove_cvref_t
Return the input type with const, volatile and references removed (transformation_trait shortcut).
Definition: type_traits:73
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:128
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition: traits.hpp:388
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
The <ranges> header from C++20's standard library.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::slice.
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:30
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:44
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:29
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Provides seqan3::views::to.
Provides seqan3::views::to_char.
T tuple_size_v
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
T visit(T... args)