SeqAn3  3.2.0
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <bit>
16 #include <iterator>
17 #include <ranges>
18 #include <string>
19 #include <vector>
20 
33 
34 namespace seqan3
35 {
36 
49 class format_bam : private detail::format_sam_base
50 {
51 public:
55  // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56  format_bam() = default;
57  format_bam(format_bam const &) = default;
58  format_bam & operator=(format_bam const &) = default;
59  format_bam(format_bam &&) = default;
60  format_bam & operator=(format_bam &&) = default;
61  ~format_bam() = default;
62 
64 
66  static inline std::vector<std::string> file_extensions{{"bam"}};
67 
68 protected:
69  template <typename stream_type, // constraints checked by file
70  typename seq_legal_alph_type,
71  typename ref_seqs_type,
72  typename ref_ids_type,
73  typename stream_pos_type,
74  typename seq_type,
75  typename id_type,
76  typename offset_type,
77  typename ref_seq_type,
78  typename ref_id_type,
79  typename ref_offset_type,
80  typename align_type,
81  typename cigar_type,
82  typename flag_type,
83  typename mapq_type,
84  typename qual_type,
85  typename mate_type,
86  typename tag_dict_type,
87  typename e_value_type,
88  typename bit_score_type>
89  void read_alignment_record(stream_type & stream,
90  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
91  ref_seqs_type & ref_seqs,
93  stream_pos_type & position_buffer,
94  seq_type & seq,
95  qual_type & qual,
96  id_type & id,
97  offset_type & offset,
98  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
99  ref_id_type & ref_id,
100  ref_offset_type & ref_offset,
101  align_type & align,
102  cigar_type & cigar_vector,
103  flag_type & flag,
104  mapq_type & mapq,
105  mate_type & mate,
106  tag_dict_type & tag_dict,
107  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
108  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
109 
110  template <typename stream_type,
111  typename header_type,
112  typename seq_type,
113  typename id_type,
114  typename ref_seq_type,
115  typename ref_id_type,
116  typename align_type,
117  typename cigar_type,
118  typename qual_type,
119  typename mate_type,
120  typename tag_dict_type>
121  void write_alignment_record([[maybe_unused]] stream_type & stream,
122  [[maybe_unused]] sam_file_output_options const & options,
123  [[maybe_unused]] header_type && header,
124  [[maybe_unused]] seq_type && seq,
125  [[maybe_unused]] qual_type && qual,
126  [[maybe_unused]] id_type && id,
127  [[maybe_unused]] int32_t const offset,
128  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
129  [[maybe_unused]] ref_id_type && ref_id,
130  [[maybe_unused]] std::optional<int32_t> ref_offset,
131  [[maybe_unused]] align_type && align,
132  [[maybe_unused]] cigar_type && cigar_vector,
133  [[maybe_unused]] sam_flag const flag,
134  [[maybe_unused]] uint8_t const mapq,
135  [[maybe_unused]] mate_type && mate,
136  [[maybe_unused]] tag_dict_type && tag_dict,
137  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
138  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
139 
140 private:
142  bool header_was_read{false};
143 
145  std::string string_buffer{};
146 
148  struct alignment_record_core
149  { // naming corresponds to official SAM/BAM specifications
150  int32_t block_size;
151  int32_t refID;
152  int32_t pos;
153  uint32_t l_read_name : 8;
154  uint32_t mapq : 8;
155  uint32_t bin : 16;
156  uint32_t n_cigar_op : 16;
157  sam_flag flag;
158  int32_t l_seq;
159  int32_t next_refID;
160  int32_t next_pos;
161  int32_t tlen;
162  };
163 
165  static constexpr std::array<uint8_t, 256> char_to_sam_rank{[]() constexpr {std::array<uint8_t, 256> ret{};
166 
167  using index_t = std::make_unsigned_t<char>;
168 
169  // ret['M'] = 0; set anyway by initialization
170  ret[static_cast<index_t>('I')] = 1;
171  ret[static_cast<index_t>('D')] = 2;
172  ret[static_cast<index_t>('N')] = 3;
173  ret[static_cast<index_t>('S')] = 4;
174  ret[static_cast<index_t>('H')] = 5;
175  ret[static_cast<index_t>('P')] = 6;
176  ret[static_cast<index_t>('=')] = 7;
177  ret[static_cast<index_t>('X')] = 8;
178 
179  return ret;
180 }()
181 }; // namespace seqan3
182 
184 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
185 {
186  --end;
187  if (beg >> 14 == end >> 14)
188  return ((1 << 15) - 1) / 7 + (beg >> 14);
189  if (beg >> 17 == end >> 17)
190  return ((1 << 12) - 1) / 7 + (beg >> 17);
191  if (beg >> 20 == end >> 20)
192  return ((1 << 9) - 1) / 7 + (beg >> 20);
193  if (beg >> 23 == end >> 23)
194  return ((1 << 6) - 1) / 7 + (beg >> 23);
195  if (beg >> 26 == end >> 26)
196  return ((1 << 3) - 1) / 7 + (beg >> 26);
197  return 0;
198 }
199 
206 template <typename stream_view_type, std::integral number_type>
207 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
208 {
209  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
210 }
211 
217 template <typename stream_view_type>
218 void read_float_byte_field(stream_view_type && stream_view, float & target)
219 {
220  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
221 }
222 
223 template <typename stream_view_type, typename value_type>
224 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
225  stream_view_type && stream_view,
226  value_type const & SEQAN3_DOXYGEN_ONLY(value));
227 
228 template <typename stream_view_type>
229 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
230 
231 template <typename cigar_input_type>
232 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
233 
234 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
235 }
236 ;
237 
239 template <typename stream_type, // constraints checked by file
240  typename seq_legal_alph_type,
241  typename ref_seqs_type,
242  typename ref_ids_type,
243  typename stream_pos_type,
244  typename seq_type,
245  typename id_type,
246  typename offset_type,
247  typename ref_seq_type,
248  typename ref_id_type,
249  typename ref_offset_type,
250  typename align_type,
251  typename cigar_type,
252  typename flag_type,
253  typename mapq_type,
254  typename qual_type,
255  typename mate_type,
256  typename tag_dict_type,
257  typename e_value_type,
258  typename bit_score_type>
259 inline void
261  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
262  ref_seqs_type & ref_seqs,
264  stream_pos_type & position_buffer,
265  seq_type & seq,
266  qual_type & qual,
267  id_type & id,
268  offset_type & offset,
269  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
270  ref_id_type & ref_id,
271  ref_offset_type & ref_offset,
272  align_type & align,
273  cigar_type & cigar_vector,
274  flag_type & flag,
275  mapq_type & mapq,
276  mate_type & mate,
277  tag_dict_type & tag_dict,
278  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
279  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
280 {
281  static_assert(detail::decays_to_ignore_v<ref_offset_type>
282  || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
283  "The ref_offset must be a specialisation of std::optional.");
284 
285  static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
286  "The type of field::mapq must be uint8_t.");
287 
288  static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
289  "The type of field::flag must be seqan3::sam_flag.");
290 
291  auto stream_view = seqan3::detail::istreambuf(stream);
292 
293  // these variables need to be stored to compute the ALIGNMENT
294  [[maybe_unused]] int32_t offset_tmp{};
295  [[maybe_unused]] int32_t soft_clipping_end{};
296  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
297  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
298 
299  // Header
300  // -------------------------------------------------------------------------------------------------------------
301  if (!header_was_read)
302  {
303  // magic BAM string
304  if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
305  throw format_error{"File is not in BAM format."};
306 
307  int32_t l_text{}; // length of header text including \0 character
308  int32_t n_ref{}; // number of reference sequences
309  int32_t l_name{}; // 1 + length of reference name including \0 character
310  int32_t l_ref{}; // length of reference sequence
311 
312  read_integral_byte_field(stream_view, l_text);
313 
314  if (l_text > 0) // header text is present
315  read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
316 
317  read_integral_byte_field(stream_view, n_ref);
318 
319  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
320  {
321  read_integral_byte_field(stream_view, l_name);
322 
323  string_buffer.resize(l_name - 1);
324  std::ranges::copy_n(std::ranges::begin(stream_view),
325  l_name - 1,
326  string_buffer.data()); // copy without \0 character
327  ++std::ranges::begin(stream_view); // skip \0 character
328 
329  read_integral_byte_field(stream_view, l_ref);
330 
331  if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
332  {
333  // If there was no header text, we parse reference sequences block as header information
334  if (l_text == 0)
335  {
336  auto & reference_ids = header.ref_ids();
337  // put the length of the reference sequence into ref_id_info
338  header.ref_id_info.emplace_back(l_ref, "");
339  // put the reference name into reference_ids
340  reference_ids.push_back(string_buffer);
341  // assign the reference name an ascending reference id (starts at index 0).
342  header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
343  continue;
344  }
345  }
346 
347  auto id_it = header.ref_dict.find(string_buffer);
348 
349  // sanity checks of reference information to existing header object:
350  if (id_it == header.ref_dict.end()) // [unlikely]
351  {
352  throw format_error{detail::to_string("Unknown reference name '" + string_buffer
353  + "' found in BAM file header (header.ref_ids():",
354  header.ref_ids(),
355  ").")};
356  }
357  else if (id_it->second != ref_idx) // [unlikely]
358  {
359  throw format_error{detail::to_string("Reference id '",
360  string_buffer,
361  "' at position ",
362  ref_idx,
363  " does not correspond to the position ",
364  id_it->second,
365  " in the header (header.ref_ids():",
366  header.ref_ids(),
367  ").")};
368  }
369  else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
370  {
371  throw format_error{"Provided reference has unequal length as specified in the header."};
372  }
373  }
374 
375  header_was_read = true;
376 
377  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
378  return;
379  }
380 
381  // read alignment record into buffer
382  // -------------------------------------------------------------------------------------------------------------
383  position_buffer = stream.tellg();
384  alignment_record_core core;
385  std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
386 
387  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
388  {
389  throw format_error{detail::to_string("Reference id index '",
390  core.refID,
391  "' is not in range of ",
392  "header.ref_ids(), which has size ",
393  header.ref_ids().size(),
394  ".")};
395  }
396  else if (core.refID > -1) // not unmapped
397  {
398  ref_id = core.refID; // field::ref_id
399  }
400 
401  flag = core.flag; // field::flag
402  mapq = core.mapq; // field::mapq
403 
404  if (core.pos > -1) // [[likely]]
405  ref_offset = core.pos; // field::ref_offset
406 
407  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
408  {
409  if (core.next_refID > -1)
410  get<0>(mate) = core.next_refID;
411 
412  if (core.next_pos > -1) // [[likely]]
413  get<1>(mate) = core.next_pos;
414 
415  get<2>(mate) = core.tlen;
416  }
417 
418  // read id
419  // -------------------------------------------------------------------------------------------------------------
420  auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
421  if constexpr (!detail::decays_to_ignore_v<id_type>)
422  read_forward_range_field(id_view, id); // field::id
423  else
424  detail::consume(id_view);
425  ++std::ranges::begin(stream_view); // skip '\0'
426 
427  // read cigar string
428  // -------------------------------------------------------------------------------------------------------------
429  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
430  {
431  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
432  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
433  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
434  }
435  else
436  {
437  detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
438  }
439 
440  offset = offset_tmp;
441 
442  // read sequence
443  // -------------------------------------------------------------------------------------------------------------
444  if (core.l_seq > 0) // sequence information is given
445  {
446  auto seq_stream = stream_view | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
448  [](char c) -> std::pair<dna16sam, dna16sam>
449  {
450  return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
451  dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
452  });
453 
454  if constexpr (detail::decays_to_ignore_v<seq_type>)
455  {
456  auto skip_sequence_bytes = [&]()
457  {
458  detail::consume(seq_stream);
459  if (core.l_seq & 1)
460  ++std::ranges::begin(stream_view);
461  };
462 
463  if constexpr (!detail::decays_to_ignore_v<align_type>)
464  {
465  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
466  "If you want to read ALIGNMENT but not SEQ, the alignment"
467  " object must store a sequence container at the second (query) position.");
468 
469  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
470  {
471  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
472  using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
473  constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
474 
475  get<1>(align).reserve(seq_length);
476 
477  auto tmp_iter = std::ranges::begin(seq_stream);
478  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
479 
480  if (offset_tmp & 1)
481  {
482  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
483  ++tmp_iter;
484  --seq_length;
485  }
486 
487  for (size_t i = (seq_length / 2); i > 0; --i)
488  {
489  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
490  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
491  ++tmp_iter;
492  }
493 
494  if (seq_length & 1)
495  {
496  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
497  ++tmp_iter;
498  --soft_clipping_end;
499  }
500 
501  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
502  }
503  else
504  {
505  skip_sequence_bytes();
506  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
507  }
508  }
509  else
510  {
511  skip_sequence_bytes();
512  }
513  }
514  else
515  {
516  using alph_t = std::ranges::range_value_t<decltype(seq)>;
517  constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
518 
519  for (auto [d1, d2] : seq_stream)
520  {
521  seq.push_back(from_dna16[to_rank(d1)]);
522  seq.push_back(from_dna16[to_rank(d2)]);
523  }
524 
525  if (core.l_seq & 1)
526  {
527  dna16sam d =
528  dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
529  seq.push_back(from_dna16[to_rank(d)]);
530  ++std::ranges::begin(stream_view);
531  }
532 
533  if constexpr (!detail::decays_to_ignore_v<align_type>)
534  {
535  assign_unaligned(get<1>(align),
536  seq
537  | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
538  std::ranges::distance(seq) - soft_clipping_end));
539  }
540  }
541  }
542 
543  // read qual string
544  // -------------------------------------------------------------------------------------------------------------
545  auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
547  [](char chr)
548  {
549  return static_cast<char>(chr + 33);
550  });
551  if constexpr (!detail::decays_to_ignore_v<qual_type>)
552  read_forward_range_field(qual_view, qual);
553  else
554  detail::consume(qual_view);
555 
556  // All remaining optional fields if any: SAM tags dictionary
557  // -------------------------------------------------------------------------------------------------------------
558  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4 /*block_size excluded*/)
559  - core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
560  assert(remaining_bytes >= 0);
561  auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
562 
563  while (tags_view.size() > 0)
564  {
565  if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
566  read_sam_dict_field(tags_view, tag_dict);
567  else
568  detail::consume(tags_view);
569  }
570 
571  // DONE READING - wrap up
572  // -------------------------------------------------------------------------------------------------------------
573  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
574  {
575  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
576  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
577  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
578  if (core.l_seq != 0 && offset_tmp == core.l_seq)
579  {
580  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
581  { // maybe only throw in debug mode and otherwise return an empty alignment?
582  throw format_error{
583  detail::to_string("The cigar string '",
584  offset_tmp,
585  "S",
586  ref_length,
587  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
588  "stored in the optional field CG. You need to read in the field::tags and "
589  "field::seq in order to access this information.")};
590  }
591  else
592  {
593  auto it = tag_dict.find("CG"_tag);
594 
595  if (it == tag_dict.end())
596  throw format_error{detail::to_string(
597  "The cigar string '",
598  offset_tmp,
599  "S",
600  ref_length,
601  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
602  "stored in the optional field CG but this tag is not present in the given ",
603  "record.")};
604 
605  auto cigar_view = std::views::all(std::get<std::string>(it->second));
606  std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
607  offset_tmp = soft_clipping_end = 0;
608  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
609  tag_dict.erase(it); // remove redundant information
610 
611  if constexpr (!detail::decays_to_ignore_v<align_type>)
612  {
613  assign_unaligned(
614  get<1>(align),
615  seq
616  | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
617  std::ranges::distance(seq) - soft_clipping_end));
618  }
619  }
620  }
621  }
622 
623  // Alignment object construction
624  if constexpr (!detail::decays_to_ignore_v<align_type>)
625  construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
626 
627  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
628  std::swap(cigar_vector, tmp_cigar_vector);
629 }
630 
632 template <typename stream_type,
633  typename header_type,
634  typename seq_type,
635  typename id_type,
636  typename ref_seq_type,
637  typename ref_id_type,
638  typename align_type,
639  typename cigar_type,
640  typename qual_type,
641  typename mate_type,
642  typename tag_dict_type>
643 inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
644  [[maybe_unused]] sam_file_output_options const & options,
645  [[maybe_unused]] header_type && header,
646  [[maybe_unused]] seq_type && seq,
647  [[maybe_unused]] qual_type && qual,
648  [[maybe_unused]] id_type && id,
649  [[maybe_unused]] int32_t const offset,
650  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
651  [[maybe_unused]] ref_id_type && ref_id,
652  [[maybe_unused]] std::optional<int32_t> ref_offset,
653  [[maybe_unused]] align_type && align,
654  [[maybe_unused]] cigar_type && cigar_vector,
655  [[maybe_unused]] sam_flag const flag,
656  [[maybe_unused]] uint8_t const mapq,
657  [[maybe_unused]] mate_type && mate,
658  [[maybe_unused]] tag_dict_type && tag_dict,
659  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
660  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
661 {
662  // ---------------------------------------------------------------------
663  // Type Requirements (as static asserts for user friendliness)
664  // ---------------------------------------------------------------------
665  static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
666  "The seq object must be a std::ranges::forward_range over "
667  "letters that model seqan3::alphabet.");
668 
669  static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
670  "The id object must be a std::ranges::forward_range over "
671  "letters that model seqan3::alphabet.");
672 
673  static_assert((std::ranges::forward_range<ref_seq_type> && alphabet<std::ranges::range_reference_t<ref_seq_type>>),
674  "The ref_seq object must be a std::ranges::forward_range "
675  "over letters that model seqan3::alphabet.");
676 
677  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
678  {
679  static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
680  || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
681  "The ref_id object must be a std::ranges::forward_range "
682  "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
683  }
684 
685  static_assert(tuple_like<std::remove_cvref_t<align_type>>,
686  "The align object must be a std::pair of two ranges whose "
687  "value_type is comparable to seqan3::gap");
688 
689  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2
690  && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>>
691  && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
692  "The align object must be a std::pair of two ranges whose "
693  "value_type is comparable to seqan3::gap");
694 
695  static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
696  "The qual object must be a std::ranges::forward_range "
697  "over letters that model seqan3::alphabet.");
698 
699  static_assert(tuple_like<std::remove_cvref_t<mate_type>>,
700  "The mate object must be a std::tuple of size 3 with "
701  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
702  "2) a std::integral or std::optional<std::integral>, and "
703  "3) a std::integral.");
704 
705  static_assert(
706  ((std::ranges::forward_range<decltype(std::get<0>(mate))>
707  || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
708  || detail::is_type_specialisation_of_v<
709  std::remove_cvref_t<decltype(std::get<0>(mate))>,
710  std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
711  || detail::is_type_specialisation_of_v<
712  std::remove_cvref_t<decltype(std::get<1>(mate))>,
713  std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
714  "The mate object must be a std::tuple of size 3 with "
715  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
716  "2) a std::integral or std::optional<std::integral>, and "
717  "3) a std::integral.");
718 
719  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
720  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
721 
722  if constexpr (detail::decays_to_ignore_v<header_type>)
723  {
724  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
725  "You can either construct the output file with ref_ids and ref_seqs information and "
726  "the header will be created for you, or you can access the `header` member directly."};
727  }
728  else
729  {
730  // ---------------------------------------------------------------------
731  // logical Requirements
732  // ---------------------------------------------------------------------
733 
734  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
735  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
736 
737  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
738 
739  // ---------------------------------------------------------------------
740  // Writing the BAM Header on first call
741  // ---------------------------------------------------------------------
742  if (!header_was_written)
743  {
744  stream << "BAM\1";
746  write_header(os, options, header); // write SAM header to temporary stream to query the size.
747  int32_t l_text{static_cast<int32_t>(os.str().size())};
748  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
749 
750  stream << os.str();
751 
752  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
753  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
754 
755  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
756  {
757  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
758  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
759  // write reference name:
760  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
761  stream_it = '\0';
762  // write reference sequence length:
763  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
764  }
765 
766  header_was_written = true;
767  }
768 
769  // ---------------------------------------------------------------------
770  // Writing the Record
771  // ---------------------------------------------------------------------
772  int32_t ref_length{};
773 
774  // if alignment is non-empty, replace cigar_vector.
775  // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
776  if (!std::ranges::empty(cigar_vector))
777  {
778  int32_t dummy_seq_length{};
779  for (auto & [count, operation] : cigar_vector)
780  detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
781  }
782  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
783  {
784  ref_length = std::ranges::distance(get<1>(align));
785 
786  // compute possible distance from alignment end to sequence end
787  // which indicates soft clipping at the end.
788  // This should be replaced by a free count_gaps function for
789  // aligned sequences which is more efficient if possible.
790  int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
791 
792  for (auto chr : get<1>(align))
793  if (chr == gap{})
794  ++off_end;
795 
796  off_end -= ref_length;
797  cigar_vector = detail::get_cigar_vector(align, offset, off_end);
798  }
799 
800  if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
801  {
802  tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
803  cigar_vector.resize(2);
804  cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
805  cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
806  }
807 
808  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
809 
810  // Compute the value for the l_read_name field for the bam record.
811  // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
812  // the data type to store the value is uint8_t and 255 is the maximal size.
813  // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
814  // 2 bytes.
815  uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
816  read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
817 
818  alignment_record_core core{/* block_size */ 0, // will be initialised right after
819  /* refID */ -1, // will be initialised right after
820  /* pos */ ref_offset.value_or(-1),
821  /* l_read_name */ read_name_size,
822  /* mapq */ mapq,
823  /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
824  /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
825  /* flag */ flag,
826  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
827  /* next_refId */ -1, // will be initialised right after
828  /* next_pos */ get<1>(mate).value_or(-1),
829  /* tlen */ get<2>(mate)};
830 
831  auto check_and_assign_id_to = [&header]([[maybe_unused]] auto & id_source, [[maybe_unused]] auto & id_target)
832  {
833  using id_t = std::remove_reference_t<decltype(id_source)>;
834 
835  if constexpr (!detail::decays_to_ignore_v<id_t>)
836  {
837  if constexpr (std::integral<id_t>)
838  {
839  id_target = id_source;
840  }
841  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
842  {
843  id_target = id_source.value_or(-1);
844  }
845  else
846  {
847  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
848  {
849  auto id_it = header.ref_dict.end();
850 
851  if constexpr (std::ranges::contiguous_range<decltype(id_source)>
852  && std::ranges::sized_range<decltype(id_source)>
853  && std::ranges::borrowed_range<decltype(id_source)>)
854  {
855  id_it = header.ref_dict.find(
856  std::span{std::ranges::data(id_source), std::ranges::size(id_source)});
857  }
858  else
859  {
860  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
861 
862  static_assert(
863  implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
864  "The ref_id type is not convertible to the reference id information stored in the "
865  "reference dictionary of the header object.");
866 
867  id_it = header.ref_dict.find(id_source);
868  }
869 
870  if (id_it == header.ref_dict.end())
871  {
872  throw format_error{detail::to_string("Unknown reference name '",
873  id_source,
874  "' could "
875  "not be found in BAM header ref_dict: ",
876  header.ref_dict,
877  ".")};
878  }
879 
880  id_target = id_it->second;
881  }
882  }
883  }
884  };
885 
886  // initialise core.refID
887  check_and_assign_id_to(ref_id, core.refID);
888 
889  // initialise core.next_refID
890  check_and_assign_id_to(get<0>(mate), core.next_refID);
891 
892  // initialise core.block_size
893  core.block_size = sizeof(core) - 4 /*block_size excluded*/ + core.l_read_name + core.n_cigar_op * 4
894  + // each int32_t has 4 bytes
895  (core.l_seq + 1) / 2 + // bitcompressed seq
896  core.l_seq + // quality string
897  tag_dict_binary_str.size();
898 
899  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
900 
901  if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
902  stream_it = '*';
903  else
904  std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
905  stream_it = '\0';
906 
907  // write cigar
908  for (auto [cigar_count, op] : cigar_vector)
909  {
910  cigar_count = cigar_count << 4;
911  cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
912  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
913  }
914 
915  // write seq (bit-compressed: dna16sam characters go into one byte)
916  using alph_t = std::ranges::range_value_t<seq_type>;
917  constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
918 
919  auto sit = std::ranges::begin(seq);
920  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
921  {
922  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
923  ++sidx, ++sit;
924  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
925  stream_it = static_cast<char>(compressed_chr);
926  }
927 
928  if (core.l_seq & 1) // write one more
929  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
930 
931  // write qual
932  if (std::ranges::empty(qual))
933  {
934  auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
935  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
936  }
937  else
938  {
939  if (std::ranges::distance(qual) != core.l_seq)
940  throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
941  core.l_seq,
942  ". Got quality with size ",
943  std::ranges::distance(qual),
944  " instead.")};
945 
946  auto v = qual
948  [](auto chr)
949  {
950  return static_cast<char>(to_rank(chr));
951  });
952  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
953  }
954 
955  // write optional fields
956  stream << tag_dict_binary_str;
957  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
958 }
959 
961 template <typename stream_view_type, typename value_type>
962 inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
963  stream_view_type && stream_view,
964  value_type const & SEQAN3_DOXYGEN_ONLY(value))
965 {
966  int32_t count;
967  read_integral_byte_field(stream_view, count); // read length of vector
968  std::vector<value_type> tmp_vector;
969  tmp_vector.reserve(count);
970 
971  while (count > 0)
972  {
973  value_type tmp{};
974  if constexpr (std::integral<value_type>)
975  {
976  read_integral_byte_field(stream_view, tmp);
977  }
978  else if constexpr (std::same_as<value_type, float>)
979  {
980  read_float_byte_field(stream_view, tmp);
981  }
982  else
983  {
984  constexpr bool always_false = std::is_same_v<value_type, void>;
985  static_assert(always_false, "format_bam::read_sam_dict_vector: unsupported value_type");
986  }
987  tmp_vector.push_back(std::move(tmp));
988  --count;
989  }
990  variant = std::move(tmp_vector);
991 }
992 
1010 template <typename stream_view_type>
1011 inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
1012 {
1013  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
1014  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
1015  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
1016  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
1017  by the length (int32_t) of the array, followed by the values.
1018  */
1019  auto it = std::ranges::begin(stream_view);
1020  uint16_t tag = static_cast<uint16_t>(*it) << 8;
1021  ++it; // skip char read before
1022 
1023  tag += static_cast<uint16_t>(*it);
1024  ++it; // skip char read before
1025 
1026  char type_id = *it;
1027  ++it; // skip char read before
1028 
1029  switch (type_id)
1030  {
1031  case 'A': // char
1032  {
1033  target[tag] = *it;
1034  ++it; // skip char that has been read
1035  break;
1036  }
1037  // all integer sizes are possible
1038  case 'c': // int8_t
1039  {
1040  int8_t tmp;
1041  read_integral_byte_field(stream_view, tmp);
1042  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1043  break;
1044  }
1045  case 'C': // uint8_t
1046  {
1047  uint8_t tmp;
1048  read_integral_byte_field(stream_view, tmp);
1049  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1050  break;
1051  }
1052  case 's': // int16_t
1053  {
1054  int16_t tmp;
1055  read_integral_byte_field(stream_view, tmp);
1056  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1057  break;
1058  }
1059  case 'S': // uint16_t
1060  {
1061  uint16_t tmp;
1062  read_integral_byte_field(stream_view, tmp);
1063  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1064  break;
1065  }
1066  case 'i': // int32_t
1067  {
1068  int32_t tmp;
1069  read_integral_byte_field(stream_view, tmp);
1070  target[tag] = std::move(tmp); // readable sam format only allows int32_t
1071  break;
1072  }
1073  case 'I': // uint32_t
1074  {
1075  uint32_t tmp;
1076  read_integral_byte_field(stream_view, tmp);
1077  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1078  break;
1079  }
1080  case 'f': // float
1081  {
1082  float tmp;
1083  read_float_byte_field(stream_view, tmp);
1084  target[tag] = tmp;
1085  break;
1086  }
1087  case 'Z': // string
1088  {
1089  string_buffer.clear();
1090  while (!is_char<'\0'>(*it))
1091  {
1092  string_buffer.push_back(*it);
1093  ++it;
1094  }
1095  ++it; // skip \0
1096  target[tag] = string_buffer;
1097  break;
1098  }
1099  case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
1100  {
1101  std::vector<std::byte> byte_array;
1102  std::byte value;
1103  while (!is_char<'\0'>(*it))
1104  {
1105  string_buffer.clear();
1106  string_buffer.push_back(*it);
1107  ++it;
1108 
1109  if (*it == '\0')
1110  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1111 
1112  string_buffer.push_back(*it);
1113  ++it;
1114  read_byte_field(string_buffer, value);
1115  byte_array.push_back(value);
1116  }
1117  ++it; // skip \0
1118  target[tag] = byte_array;
1119  break;
1120  }
1121  case 'B': // Array. Value type depends on second char [cCsSiIf]
1122  {
1123  char array_value_type_id = *it;
1124  ++it; // skip char read before
1125 
1126  switch (array_value_type_id)
1127  {
1128  case 'c': // int8_t
1129  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1130  break;
1131  case 'C': // uint8_t
1132  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1133  break;
1134  case 's': // int16_t
1135  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1136  break;
1137  case 'S': // uint16_t
1138  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1139  break;
1140  case 'i': // int32_t
1141  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1142  break;
1143  case 'I': // uint32_t
1144  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1145  break;
1146  case 'f': // float
1147  read_sam_dict_vector(target[tag], stream_view, float{});
1148  break;
1149  default:
1150  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1151  "must be one of [cCsSiIf] but '",
1152  array_value_type_id,
1153  "' was given.")};
1154  }
1155  break;
1156  }
1157  default:
1158  throw format_error{detail::to_string("The second character in the numerical id of a "
1159  "SAM tag must be one of [A,i,Z,H,B,f] but '",
1160  type_id,
1161  "' was given.")};
1162  }
1163 }
1164 
1179 template <typename cigar_input_type>
1180 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1181 {
1182  std::vector<cigar> operations{};
1183  char operation{'\0'};
1184  uint32_t count{};
1185  int32_t ref_length{}, seq_length{};
1186  uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1187  constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1188  constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1189 
1190  if (n_cigar_op == 0) // [[unlikely]]
1191  return std::tuple{operations, ref_length, seq_length};
1192 
1193  // parse the rest of the cigar
1194  // -------------------------------------------------------------------------------------------------------------
1195  while (n_cigar_op > 0) // until stream is not empty
1196  {
1197  std::ranges::copy_n(std::ranges::begin(cigar_input),
1198  sizeof(operation_and_count),
1199  reinterpret_cast<char *>(&operation_and_count));
1200  operation = cigar_mapping[operation_and_count & cigar_mask];
1201  count = operation_and_count >> 4;
1202 
1203  detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1204  operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1205  --n_cigar_op;
1206  }
1207 
1208  return std::tuple{operations, ref_length, seq_length};
1209 }
1210 
1214 inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1215 {
1216  std::string result{};
1217 
1218  auto stream_variant_fn = [&result](auto && arg) // helper to print a std::variant
1219  {
1220  // T is either char, int32_t, float, std::string, or a std::vector<some int>
1221  using T = std::remove_cvref_t<decltype(arg)>;
1222 
1223  if constexpr (std::same_as<T, int32_t>)
1224  {
1225  // always choose the smallest possible representation [cCsSiI]
1226  size_t const absolute_arg = std::abs(arg);
1227  auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1228  bool const negative = arg < 0;
1229  n = n * n + 2 * negative; // for switch case order
1230 
1231  switch (n)
1232  {
1233  case 0:
1234  {
1235  result[result.size() - 1] = 'C';
1236  result.append(reinterpret_cast<char const *>(&arg), 1);
1237  break;
1238  }
1239  case 1:
1240  {
1241  result[result.size() - 1] = 'S';
1242  result.append(reinterpret_cast<char const *>(&arg), 2);
1243  break;
1244  }
1245  case 2:
1246  {
1247  result[result.size() - 1] = 'c';
1248  int8_t tmp = static_cast<int8_t>(arg);
1249  result.append(reinterpret_cast<char const *>(&tmp), 1);
1250  break;
1251  }
1252  case 3:
1253  {
1254  result[result.size() - 1] = 's';
1255  int16_t tmp = static_cast<int16_t>(arg);
1256  result.append(reinterpret_cast<char const *>(&tmp), 2);
1257  break;
1258  }
1259  default:
1260  {
1261  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1262  break;
1263  }
1264  }
1265  }
1266  else if constexpr (std::same_as<T, std::string>)
1267  {
1268  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1 /*+ null character*/);
1269  }
1270  else if constexpr (!std::ranges::range<T>) // char, float
1271  {
1272  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1273  }
1274  else // std::vector of some arithmetic_type type
1275  {
1276  int32_t sz{static_cast<int32_t>(arg.size())};
1277  result.append(reinterpret_cast<char *>(&sz), 4);
1278  result.append(reinterpret_cast<char const *>(arg.data()),
1279  arg.size() * sizeof(std::ranges::range_value_t<T>));
1280  }
1281  };
1282 
1283  for (auto & [tag, variant] : tag_dict)
1284  {
1285  result.push_back(static_cast<char>(tag / 256));
1286  result.push_back(static_cast<char>(tag % 256));
1287 
1288  result.push_back(detail::sam_tag_type_char[variant.index()]);
1289 
1290  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1291  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1292 
1293  std::visit(stream_variant_fn, variant);
1294  }
1295 
1296  return result;
1297 }
1298 
1299 } // namespace seqan3
The <bit> header from C++20's standard library.
constexpr derived_type & assign_char(char_type const chr) noexcept requires(!std
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:187
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: alphabet/cigar/cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: alphabet/cigar/cigar.hpp:98
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: dna16sam.hpp:48
The BAM format.
Definition: format_bam.hpp:50
format_bam()=default
Defaulted.
void write_alignment_record([[maybe_unused]] stream_type &stream, [[maybe_unused]] sam_file_output_options const &options, [[maybe_unused]] header_type &&header, [[maybe_unused]] seq_type &&seq, [[maybe_unused]] qual_type &&qual, [[maybe_unused]] id_type &&id, [[maybe_unused]] int32_t const offset, [[maybe_unused]] ref_seq_type &&ref_seq, [[maybe_unused]] ref_id_type &&ref_id, [[maybe_unused]] std::optional< int32_t > ref_offset, [[maybe_unused]] align_type &&align, [[maybe_unused]] cigar_type &&cigar_vector, [[maybe_unused]] sam_flag const flag, [[maybe_unused]] uint8_t const mapq, [[maybe_unused]] mate_type &&mate, [[maybe_unused]] tag_dict_type &&tag_dict, [[maybe_unused]] double e_value, [[maybe_unused]] double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:643
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:260
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:66
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:34
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:179
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
T clear(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T end(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:469
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
The <ranges> header from C++20's standard library.
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Thrown if information given to output format didn't match expectations.
Definition: io/exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T tuple_size_v
T visit(T... args)