SeqAn3  3.1.0
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <cstdio>
17 #include <iterator>
18 #include <seqan3/std/ranges>
19 #include <stack>
20 #include <string>
21 #include <string_view>
22 #include <type_traits>
23 #include <vector>
24 
44 
45 namespace seqan3
46 {
86 {
87 public:
91  format_vienna() noexcept = default;
92  format_vienna(format_vienna const &) noexcept = default;
93  format_vienna & operator=(format_vienna const &) noexcept = default;
94  format_vienna(format_vienna &&) noexcept = default;
95  format_vienna & operator=(format_vienna &&) noexcept = default;
96  ~format_vienna() noexcept = default;
97 
99 
101  static inline std::vector<std::string> file_extensions
102  {
103  { "dbn" },
104  { "fasta" },
105  { "fa" }
106  };
107 
108 protected:
110  template <typename stream_type, // constraints checked by file
111  typename seq_legal_alph_type,
112  bool structured_seq_combined,
113  typename seq_type, // other constraints checked inside function
114  typename id_type,
115  typename bpp_type,
116  typename structure_type,
117  typename energy_type,
118  typename react_type,
119  typename comment_type,
120  typename offset_type>
121  void read_structure_record(stream_type & stream,
123  seq_type & seq,
124  id_type & id,
125  bpp_type & bpp,
126  structure_type & structure,
127  energy_type & energy,
128  react_type & SEQAN3_DOXYGEN_ONLY(react),
129  react_type & SEQAN3_DOXYGEN_ONLY(react_err),
130  comment_type & SEQAN3_DOXYGEN_ONLY(comment),
131  offset_type & SEQAN3_DOXYGEN_ONLY(offset))
132  {
133  auto stream_view = detail::istreambuf(stream);
134 
135  // READ ID (if present)
136  auto constexpr is_id = is_char<'>'>;
137  if (is_id(*begin(stream_view)))
138  {
139  if constexpr (!detail::decays_to_ignore_v<id_type>)
140  {
141  if (options.truncate_ids)
142  {
143  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
144  | detail::take_until_or_throw(is_cntrl || is_blank)
145  | views::char_to<std::ranges::range_value_t<id_type>>,
146  std::cpp20::back_inserter(id));
147  detail::consume(stream_view | detail::take_line_or_throw);
148  }
149  else
150  {
151  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
152  | detail::take_line_or_throw
153  | views::char_to<std::ranges::range_value_t<id_type>>,
154  std::cpp20::back_inserter(id));
155  }
156  }
157  else
158  {
159  detail::consume(stream_view | detail::take_line_or_throw);
160  }
161  }
162  else if constexpr (!detail::decays_to_ignore_v<id_type>)
163  {
164  auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
165  if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
166  {
167  throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
168  is_id.msg + " and char_is_valid_for<" +
169  detail::type_name_as_string<seq_legal_alph_type> + ">" +
170  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
171  }
172  }
173 
174  // READ SEQUENCE
175  if constexpr (!detail::decays_to_ignore_v<seq_type>)
176  {
177  auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
178  std::ranges::copy(stream_view | detail::take_line_or_throw // until end of line
179  | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
180  | std::views::transform([is_legal_seq](char const c)
181  {
182  if (!is_legal_seq(c)) // enforce legal alphabet
183  {
184  throw parse_error{std::string{"Encountered an unexpected letter: "} +
185  "char_is_valid_for<" +
186  detail::type_name_as_string<seq_legal_alph_type> +
187  "> evaluated to false on " +
188  detail::make_printable(c)};
189  }
190  return c;
191  })
192  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
193  std::cpp20::back_inserter(seq));
194  }
195  else
196  {
197  detail::consume(stream_view | detail::take_line_or_throw);
198  }
199 
200  // READ STRUCTURE (if present)
201  [[maybe_unused]] int64_t structure_length{};
202  if constexpr (!detail::decays_to_ignore_v<structure_type>)
203  {
204  if constexpr (structured_seq_combined)
205  {
206  assert(std::addressof(seq) == std::addressof(structure));
207  using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
208  // We need the structure_length parameter to count the length of the structure while reading
209  // because we cannot infer it from the (already resized) structure_seq object.
210  auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
211  structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
212 
213  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
214  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
215  }
216  else
217  {
218  using alph_type = std::ranges::range_value_t<structure_type>;
219  std::ranges::copy(read_structure<alph_type>(stream_view), std::cpp20::back_inserter(structure));
220  structure_length = std::ranges::distance(structure);
221 
222  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
223  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
224  }
225  }
226  else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
227  {
228  detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
229  structure_length = std::ranges::distance(bpp);
230  }
231  else
232  {
233  detail::consume(stream_view | detail::take_until(is_space)); // until whitespace
234  }
235 
236  if constexpr (!detail::decays_to_ignore_v<seq_type> &&
237  !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
238  {
239  if (std::ranges::distance(seq) != structure_length)
240  throw parse_error{"Found sequence and associated structure of different length."};
241  }
242 
243  // READ ENERGY (if present)
244  if constexpr (!detail::decays_to_ignore_v<energy_type>)
245  {
246  std::string e_str = stream_view | detail::take_line
247  | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
248  | views::to<std::string>;
249  if (!e_str.empty())
250  {
251  size_t num_processed;
252  energy = std::stod(e_str, &num_processed);
253  if (num_processed != e_str.size()) // [[unlikely]]
254  {
255  throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
256  }
257  }
258  }
259  else
260  {
261  detail::consume(stream_view | detail::take_line);
262  }
263  detail::consume(stream_view | detail::take_until(!is_space));
264  }
265 
267  template <typename stream_type, // constraints checked by file
268  typename seq_type, // other constraints checked inside function
269  typename id_type,
270  typename bpp_type,
271  typename structure_type,
272  typename energy_type,
273  typename react_type,
274  typename comment_type,
275  typename offset_type>
276  void write_structure_record(stream_type & stream,
277  structure_file_output_options const & options,
278  seq_type && seq,
279  id_type && id,
280  bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
281  structure_type && structure,
282  energy_type && energy,
283  react_type && SEQAN3_DOXYGEN_ONLY(react),
284  react_type && SEQAN3_DOXYGEN_ONLY(react_err),
285  comment_type && SEQAN3_DOXYGEN_ONLY(comment),
286  offset_type && SEQAN3_DOXYGEN_ONLY(offset))
287  {
288  std::cpp20::ostreambuf_iterator stream_it{stream};
289 
290  // WRITE ID (optional)
291  if constexpr (!detail::decays_to_ignore_v<id_type>)
292  {
293  if (!std::ranges::empty(id))
294  {
295  stream_it = '>';
296  stream_it = ' ';
297  std::ranges::copy(id, stream_it);
298  detail::write_eol(stream_it, options.add_carriage_return);
299  }
300  }
301 
302  // WRITE SEQUENCE
303  if constexpr (!detail::decays_to_ignore_v<seq_type>)
304  {
305  if (std::ranges::empty(seq)) //[[unlikely]]
306  throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
307 
308  std::ranges::copy(seq | views::to_char, stream_it);
309  detail::write_eol(stream_it, options.add_carriage_return);
310  }
311  else
312  {
313  throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
314  "when writing Vienna files."};
315  }
316 
317  // WRITE STRUCTURE (optional)
318  if constexpr (!detail::decays_to_ignore_v<structure_type>)
319  {
320  if (!std::ranges::empty(structure))
321  std::ranges::copy(structure | views::to_char, stream_it);
322 
323  // WRITE ENERGY (optional)
324  if constexpr (!detail::decays_to_ignore_v<energy_type>)
325  {
326  if (energy)
327  {
328 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
329 // auto [endptr, ec] = std::to_chars(str.data(),
330 // str.data() + str.size(),
331 // energy,
332 // std::chars_format::fixed,
333 // options.precision);
334 // if (ec == std::errc())
335 // std::ranges::copy(str.data(), endptr, stream_it);
336 // else
337 // throw std::runtime_error{"The energy could not be transformed into a string."};
338 
339  stream_it = ' ';
340  stream_it = '(';
341 
342  std::array<char, 100> str;
343  int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
344  if (len < 0 || len >= 100)
345  throw std::runtime_error{"The energy could not be transformed into a string."};
346  std::ranges::copy(str.data(), str.data() + len, stream_it);
347 
348  stream_it = ')';
349  }
350  }
351  detail::write_eol(stream_it, options.add_carriage_return);
352  }
353  else if constexpr (!detail::decays_to_ignore_v<energy_type>)
354  {
355  throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
356  }
357  }
358 
359 private:
366  template <typename alph_type, typename stream_view_type>
367  auto read_structure(stream_view_type & stream_view)
368  {
369  auto constexpr is_legal_structure = char_is_valid_for<alph_type>;
370  return stream_view | detail::take_until(is_space) // until whitespace
371  | std::views::transform([is_legal_structure](char const c)
372  {
373  if (!is_legal_structure(c))
374  {
375  throw parse_error{
376  std::string{"Encountered an unexpected letter: char_is_valid_for<"} +
377  detail::type_name_as_string<alph_type> +
378  "> evaluated to false on " + detail::make_printable(c)};
379  }
380  return c;
381  }) // enforce legal alphabet
382  | views::char_to<alph_type>; // convert to actual target alphabet
383  }
384 };
385 
386 } // namespace seqan3::detail
The <algorithm> header from C++20's standard library.
Core alphabet concept and free function/type trait wrappers.
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:86
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:102
format_vienna() noexcept=default
Defaulted.
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:121
Provides various utility functions.
Provides various transformation traits used by the range module.
Provides various utility functions.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
The <ranges> header from C++20's standard library.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:30
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_file_output_format and auxiliary classes.
Provides seqan3::structure_file_output_options.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides the WUSS format for RNA structure.