68 format_sam_base() noexcept = default;
69 format_sam_base(format_sam_base const &) noexcept = default;
70 format_sam_base & operator=(format_sam_base const &) noexcept = default;
71 format_sam_base(format_sam_base &&) noexcept = default;
72 format_sam_base & operator=(format_sam_base &&) noexcept = default;
73 ~format_sam_base() noexcept = default;
77 static constexpr
std::array format_version{
'1',
'.',
'6'};
83 bool header_was_written{
false};
86 bool ref_info_present_in_header{
false};
88 template <
typename ref_id_type,
89 typename ref_id_tmp_type,
91 typename ref_seqs_type>
92 void check_and_assign_ref_id(ref_id_type &
ref_id,
93 ref_id_tmp_type & ref_id_tmp,
97 static void update_alignment_lengths(int32_t & ref_length,
99 char const cigar_operation,
100 uint32_t
const cigar_count);
102 template <
typename align_type,
typename ref_seqs_type>
103 void construct_alignment(align_type & align,
105 [[maybe_unused]] int32_t rid,
106 [[maybe_unused]] ref_seqs_type & ref_seqs,
107 [[maybe_unused]] int32_t ref_start,
110 void transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector, int32_t & sc_begin, int32_t & sc_end)
const;
112 template <
typename cigar_input_type>
115 template <
typename stream_view_type>
116 void read_field(stream_view_type && stream_view, detail::ignore_t
const & SEQAN3_DOXYGEN_ONLY(target));
118 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
119 void read_field(stream_view_type && stream_view, target_range_type & target);
121 template <
typename stream_view_t, arithmetic arithmetic_target_type>
122 void read_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target);
124 template <
typename stream_view_type,
typename optional_value_type>
127 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
128 void read_header(stream_view_type && stream_view,
129 alignment_file_header<ref_ids_type> & hdr,
132 template <
typename stream_t,
typename ref_
ids_type>
133 void write_header(stream_t & stream,
134 alignment_file_output_options
const & options,
135 alignment_file_header<ref_ids_type> & header);
148 template <
typename ref_id_type,
149 typename ref_id_tmp_type,
150 typename header_type,
151 typename ref_seqs_type>
152 inline void format_sam_base::check_and_assign_ref_id(ref_id_type &
ref_id,
153 ref_id_tmp_type & ref_id_tmp,
154 header_type & header,
157 if (!std::ranges::empty(ref_id_tmp))
159 auto search = header.ref_dict.find(ref_id_tmp);
161 if (
search == header.ref_dict.end())
163 if constexpr(detail::decays_to_ignore_v<ref_seqs_type>)
165 if (ref_info_present_in_header)
167 throw format_error{
"Unknown reference id found in record which is not present in the header."};
171 header.ref_ids().push_back(ref_id_tmp);
173 header.ref_dict[header.ref_ids()[pos]] = pos;
179 throw format_error{
"Unknown reference id found in record which is not present in the given ids."};
195 inline void format_sam_base::update_alignment_lengths(int32_t & ref_length,
196 int32_t & seq_length,
197 char const cigar_operation,
198 uint32_t
const cigar_count)
200 switch (cigar_operation)
202 case 'M':
case '=':
case 'X': ref_length += cigar_count, seq_length += cigar_count;
break;
203 case 'D':
case 'N': ref_length += cigar_count;
break;
204 case 'I' : seq_length += cigar_count;
break;
205 case 'S':
case 'H':
case 'P':
break;
206 default:
throw format_error{
"Illegal cigar operation: " +
std::string{cigar_operation}};
215 inline void format_sam_base::transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector,
217 int32_t & sc_end)
const 220 auto soft_clipping_at = [&] (
size_t const index) {
return cigar_vector[index] ==
'S'_cigar_op; };
222 auto hard_clipping_at = [&] (
size_t const index) {
return cigar_vector[index] ==
'H'_cigar_op; };
224 auto vector_size_at_least = [&] (
size_t const min_size) {
return cigar_vector.
size() >= min_size; };
226 auto cigar_count_at = [&] (
size_t const index) {
return get<0>(cigar_vector[index]); };
229 if (vector_size_at_least(1) && soft_clipping_at(0))
230 sc_begin = cigar_count_at(0);
231 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
232 sc_begin = cigar_count_at(1);
237 auto last_index = cigar_vector.
size() - 1;
238 auto second_last_index = last_index - 1;
240 if (vector_size_at_least(2) && soft_clipping_at(last_index))
241 sc_end = cigar_count_at(last_index);
242 else if (vector_size_at_least(3) && hard_clipping_at(last_index) && soft_clipping_at(second_last_index))
243 sc_end = cigar_count_at(second_last_index);
259 template <
typename cigar_input_type>
264 char cigar_operation{};
265 uint32_t cigar_count{};
266 int32_t ref_length{}, seq_length{};
273 while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view))
276 cigar_operation = *std::ranges::begin(cigar_view);
277 std::ranges::next(std::ranges::begin(cigar_view));
280 throw format_error{
"Corrupted cigar string encountered"};
282 update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
283 operations.emplace_back(cigar_count, cigar_op{}.assign_char(cigar_operation));
286 return {operations, ref_length, seq_length};
299 template <
typename align_type,
typename ref_seqs_type>
300 inline void format_sam_base::construct_alignment(align_type & align,
302 [[maybe_unused]] int32_t rid,
303 [[maybe_unused]] ref_seqs_type & ref_seqs,
304 [[maybe_unused]] int32_t ref_start,
307 if (rid > -1 && ref_start > -1 &&
308 !cigar_vector.
empty() &&
309 !std::ranges::empty(get<1>(align)))
311 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
313 assert(static_cast<size_t>(ref_start + ref_length) <=
std::ranges::size(ref_seqs[rid]));
319 using unaligned_t = remove_cvref_t<detail::unaligned_seq_t<decltype(get<0>(align))>>;
320 auto dummy_seq =
views::repeat_n(value_type_t<unaligned_t>{}, ref_length)
322 static_assert(
std::same_as<unaligned_t, decltype(dummy_seq)>,
323 "No reference information was given so the type of the first alignment tuple position" 324 "must have an unaligned sequence type of a dummy sequence (" 325 "views::repeat_n(dna5{}, size_t{}) | " 326 "std::views::transform(detail::access_restrictor_fn{}))");
332 detail::alignment_from_cigar(align, cigar_vector);
336 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
343 using unaligned_t = remove_cvref_t<detail::unaligned_seq_t<decltype(get<0>(align))>>;
356 template <
typename stream_view_type>
357 inline void format_sam_base::read_field(stream_view_type && stream_view,
358 detail::ignore_t
const & SEQAN3_DOXYGEN_ONLY(target))
360 detail::consume(stream_view);
370 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
371 inline void format_sam_base::read_field(stream_view_type && stream_view, target_range_type & target)
373 if (!is_char<'*'>(*std::ranges::begin(stream_view)))
374 std::ranges::copy(stream_view |
views::char_to<value_type_t<target_range_type>>,
375 std::ranges::back_inserter(target));
377 std::ranges::next(std::ranges::begin(stream_view));
390 template <
typename stream_view_t, arithmetic arithmetic_target_type>
391 inline void format_sam_base::read_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target)
394 auto [ignore,
end] = std::ranges::copy(stream_view, arithmetic_buffer.data());
398 if (res.ec == std::errc::invalid_argument || res.ptr != end)
399 throw format_error{
std::string(
"[CORRUPTED SAM FILE] The string '") +
401 "' could not be cast into type " +
402 detail::type_name_as_string<arithmetic_target_type>};
404 if (res.ec == std::errc::result_out_of_range)
406 "' into type " + detail::type_name_as_string<arithmetic_target_type> +
407 " would cause an overflow."};
420 template <
typename stream_view_type,
typename optional_value_type>
423 optional_value_type tmp;
424 read_field(std::forward<stream_view_type>(stream_view), tmp);
444 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
445 inline void format_sam_base::read_header(stream_view_type && stream_view,
446 alignment_file_header<ref_ids_type> & hdr,
449 auto parse_tag_value = [&stream_view,
this] (
auto & value)
452 std::ranges::next(std::ranges::begin(stream_view));
456 while (is_char<'@'>(*std::ranges::begin(stream_view)))
458 std::ranges::next(std::ranges::begin(stream_view));
460 if (is_char<'H'>(*std::ranges::begin(stream_view)))
462 parse_tag_value(hdr.format_version);
465 while (is_char<'\t'>(*std::ranges::begin(stream_view)))
467 std::ranges::next(std::ranges::begin(stream_view));
470 if (is_char<'S'>(*std::ranges::begin(stream_view)))
472 std::ranges::next(std::ranges::begin(stream_view));
474 if (is_char<'O'>(*std::ranges::begin(stream_view)))
476 else if (is_char<'S'>(*std::ranges::begin(stream_view)))
479 throw format_error{
std::string{
"Illegal SAM header tag: S"} +
480 std::string{
static_cast<char>(*std::ranges::begin(stream_view))}};
482 else if (!is_char<'G'>(*std::ranges::begin(stream_view)))
484 throw format_error{
std::string{
"Illegal SAM header tag in @HG starting with:"} +
485 std::string{
static_cast<char>(*std::ranges::begin(stream_view))}};
488 parse_tag_value(*who);
490 std::ranges::next(std::ranges::begin(stream_view));
492 else if (is_char<'S'>(*std::ranges::begin(stream_view)))
494 ref_info_present_in_header =
true;
499 std::ranges::next(std::ranges::begin(stream_view));
500 parse_tag_value(get<0>(info));
502 if (is_char<'\t'>(*std::ranges::begin(stream_view)))
504 std::ranges::next(std::ranges::begin(stream_view));
507 std::ranges::next(std::ranges::begin(stream_view));
511 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
513 auto id_it = hdr.ref_dict.find(
id);
515 if (id_it == hdr.ref_dict.end())
516 throw format_error{detail::to_string(
"Unknown reference name '",
id,
"' found in SAM header ",
517 "(header.ref_ids(): ", hdr.ref_ids(),
").")};
519 auto & given_ref_info = hdr.ref_id_info[id_it->second];
521 if (std::get<0>(given_ref_info) != std::get<0>(info))
522 throw format_error{
"Provided reference has unequal length as specified in the header."};
524 hdr.ref_id_info[id_it->second] =
std::move(info);
528 static_assert(!detail::is_type_specialisation_of_v<decltype(hdr.ref_ids()),
std::deque>,
529 "The range over reference ids must be of type std::deque such that " 530 "pointers are not invalidated.");
532 hdr.ref_ids().push_back(
id);
533 hdr.ref_id_info.push_back(info);
534 hdr.ref_dict[(hdr.ref_ids())[(hdr.ref_ids()).
size() - 1]] = (hdr.ref_ids()).
size() - 1;
537 else if (is_char<'R'>(*std::ranges::begin(stream_view)))
541 parse_tag_value(get<0>(tmp));
543 if (is_char<'\t'>(*std::ranges::begin(stream_view)))
545 std::ranges::next(std::ranges::begin(stream_view));
548 std::ranges::next(std::ranges::begin(stream_view));
550 hdr.read_groups.emplace_back(
std::move(tmp));
552 else if (is_char<'P'>(*std::ranges::begin(stream_view)))
554 typename alignment_file_header<ref_ids_type>::program_info_t tmp{};
556 parse_tag_value(tmp.id);
559 while (is_char<'\t'>(*std::ranges::begin(stream_view)))
561 std::ranges::next(std::ranges::begin(stream_view));
564 if (is_char<'P'>(*std::ranges::begin(stream_view)))
566 std::ranges::next(std::ranges::begin(stream_view));
568 if (is_char<'N'>(*std::ranges::begin(stream_view)))
573 else if (is_char<'C'>(*std::ranges::begin(stream_view)))
575 who = &tmp.command_line_call;
577 else if (is_char<'D'>(*std::ranges::begin(stream_view)))
579 who = &tmp.description;
581 else if (!is_char<'V'>(*std::ranges::begin(stream_view)))
583 throw format_error{
std::string{
"Illegal SAM header tag starting with:"} +
584 std::string{
static_cast<char>(*std::ranges::begin(stream_view))}};
587 parse_tag_value(*who);
589 std::ranges::next(std::ranges::begin(stream_view));
591 hdr.program_infos.emplace_back(
std::move(tmp));
593 else if (is_char<'C'>(*std::ranges::begin(stream_view)))
596 std::ranges::next(std::ranges::begin(stream_view));
597 std::ranges::next(std::ranges::begin(stream_view));
598 std::ranges::next(std::ranges::begin(stream_view));
600 std::ranges::next(std::ranges::begin(stream_view));
602 hdr.comments.emplace_back(
std::move(tmp));
606 throw format_error{
std::string{
"Illegal SAM header tag starting with:"} +
607 std::string{
static_cast<char>(*std::ranges::begin(stream_view))}};
628 template <
typename stream_t,
typename ref_
ids_type>
629 inline void format_sam_base::write_header(stream_t & stream,
630 alignment_file_output_options
const & options,
631 alignment_file_header<ref_ids_type> & header)
639 if (!header.sorting.empty() &&
640 !(header.sorting ==
"unknown" ||
641 header.sorting ==
"unsorted" ||
642 header.sorting ==
"queryname" ||
643 header.sorting ==
"coordinate" ))
644 throw format_error{
"SAM format error: The header.sorting member must be " 645 "one of [unknown, unsorted, queryname, coordinate]."};
647 if (!header.grouping.empty() &&
648 !(header.grouping ==
"none" ||
649 header.grouping ==
"query" ||
650 header.grouping ==
"reference"))
651 throw format_error{
"SAM format error: The header.grouping member must be " 652 "one of [none, query, reference]."};
671 stream <<
"@HD\tVN:";
672 std::ranges::copy(format_version, stream_it);
674 if (!header.sorting.empty())
675 stream <<
"\tSO:" << header.sorting;
677 if (!header.subsorting.empty())
678 stream <<
"\tSS:" << header.subsorting;
680 if (!header.grouping.empty())
681 stream <<
"\tGO:" << header.grouping;
683 detail::write_eol(stream_it, options.add_carriage_return);
686 for (
auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
688 stream <<
"@SQ\tSN:";
690 std::ranges::copy(ref_name, stream_it);
692 stream <<
"\tLN:" << get<0>(ref_info);
694 if (!get<1>(ref_info).empty())
695 stream <<
"\t" << get<1>(ref_info);
697 detail::write_eol(stream_it, options.add_carriage_return);
701 for (
auto const & read_group : header.read_groups)
704 <<
"\tID:" << get<0>(read_group);
706 if (!get<1>(read_group).empty())
707 stream <<
"\t" << get<1>(read_group);
709 detail::write_eol(stream_it, options.add_carriage_return);
713 for (
auto const & program : header.program_infos)
716 <<
"\tID:" << program.id;
718 if (!program.name.empty())
719 stream <<
"\tPN:" << program.name;
721 if (!program.command_line_call.empty())
722 stream <<
"\tCL:" << program.command_line_call;
724 if (!program.previous.empty())
725 stream <<
"\tPP:" << program.previous;
727 if (!program.description.empty())
728 stream <<
"\tDS:" << program.description;
730 if (!program.version.empty())
731 stream <<
"\tVN:" << program.version;
733 detail::write_eol(stream_it, options.add_carriage_return);
737 for (
auto const &
comment : header.comments)
740 detail::write_eol(stream_it, options.add_carriage_return);
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:69
Provides concepts for core language types and relations that don't have concepts in C++20 (yet)...
void assign_unaligned(aligned_seq_t &aligned_seq, unaligned_sequence_type &&unaligned_seq)
An implementation of seqan3::aligned_sequence::assign_unaligned_sequence for sequence containers...
Definition: aligned_sequence_concept.hpp:375
typename value_type< t >::type value_type_t
Shortcut for seqan3::value_type (transformation_trait shortcut).
Definition: pre.hpp:48
Provides seqan3::views::istreambuf.
Provides seqan3::views::zip.
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:167
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides traits to inspect some information of a type, for example its name.
Result type of std::from_chars.
Definition: charconv:61
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Writes successive characters onto the output stream from which...
Definition: iterator.hpp:204
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
Provides seqan3::type_list and auxiliary type traits.
constexpr auto single_pass_input
A view adapter that decays most of the range properties and adds single pass behavior.
Definition: single_pass_input.hpp:378
Provides seqan3::views::to.
SeqAn specific customisations in the standard namespace.
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results...
Definition: traits.hpp:307
Auxiliary for pretty printing of exception messages.
Auxiliary functions for the alignment IO.
Provides std::from_chars and std::to_chars if not defined in the stl <charconv> header.
Provides seqan3::alignment_file_output_options.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::tuple_like.
Provides seqan3::views::repeat_n.
Provides various utility functions.
std::from_chars_result from_chars(char const *first, char const *last, value_type &value, int base) noexcept
Parse a char sequence into an integral.
Definition: charconv:935
Provides various utility functions.
auto constexpr is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:287
Provides seqan3::views::char_to.
Comment field of arbitrary content, usually a string.
Adaptations of concepts from the Ranges TS.
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
Definition: aligned_sequence_concept.hpp:36
Provides character predicates for tokenisation.
Provides seqan3::ostream and seqan3::ostreambuf iterator.
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:68
Adaptations of algorithms from the Ranges TS.
Provides seqan3::views::slice.
Provides seqan3::views::to_char.
Provides various transformation traits used by the range module.
auto constexpr take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:599
The concept std::same_as<T, U> is satisfied if and only if T and U denote the same type...
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
The identifier of the (reference) sequence that SEQ was aligned to.
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:94