From 92b69215e63a3303a5e904ab332e2eec236e0ed2 Mon Sep 17 00:00:00 2001
From: Andrew Kelley <andrew@ziglang.org>
Date: Mon, 16 Aug 2021 23:30:18 -0700
Subject: update libcxx, libcxxabi, libunwind, and tsan to llvm 13 rc1

---
 lib/libcxx/include/string | 355 +++++++++++++++++++++++-----------------------
 1 file changed, 179 insertions(+), 176 deletions(-)

(limited to 'lib/libcxx/include/string')
diff --git a/lib/libcxx/include/string b/lib/libcxx/include/string
index 687795c79b..4940021b0c 100644
--- a/lib/libcxx/include/string
+++ b/lib/libcxx/include/string
@@ -69,6 +69,9 @@ struct char_traits
 
 template <> struct char_traits<char>;
 template <> struct char_traits<wchar_t>;
+template <> struct char_traits<char8_t>;  // C++20
+template <> struct char_traits<char16_t>;
+template <> struct char_traits<char32_t>;
 
 template<class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
 class basic_string
@@ -107,6 +110,7 @@ public:
         explicit basic_string(const T& t, const Allocator& a = Allocator()); // C++17
     basic_string(const value_type* s, const allocator_type& a = allocator_type());
     basic_string(const value_type* s, size_type n, const allocator_type& a = allocator_type());
+    basic_string(nullptr_t) = delete; // C++2b
     basic_string(size_type n, value_type c, const allocator_type& a = allocator_type());
     template<class InputIterator>
         basic_string(InputIterator begin, InputIterator end,
@@ -127,6 +131,7 @@ public:
              allocator_type::propagate_on_container_move_assignment::value ||
              allocator_type::is_always_equal::value ); // C++17
     basic_string& operator=(const value_type* s);
+    basic_string& operator=(nullptr_t) = delete; // C++2b
     basic_string& operator=(value_type c);
     basic_string& operator=(initializer_list<value_type>);
 
@@ -262,49 +267,49 @@ public:
 
     size_type find(const basic_string& str, size_type pos = 0) const noexcept;
     template <class T>
-        size_type find(const T& t, size_type pos = 0) const;  // C++17
+        size_type find(const T& t, size_type pos = 0) const noexcept; // C++17, noexcept as an extension
     size_type find(const value_type* s, size_type pos, size_type n) const noexcept;
     size_type find(const value_type* s, size_type pos = 0) const noexcept;
     size_type find(value_type c, size_type pos = 0) const noexcept;
 
     size_type rfind(const basic_string& str, size_type pos = npos) const noexcept;
     template <class T>
-        size_type rfind(const T& t, size_type pos = npos) const;  // C++17
+        size_type rfind(const T& t, size_type pos = npos) const noexcept; // C++17, noexcept as an extension
     size_type rfind(const value_type* s, size_type pos, size_type n) const noexcept;
     size_type rfind(const value_type* s, size_type pos = npos) const noexcept;
     size_type rfind(value_type c, size_type pos = npos) const noexcept;
 
     size_type find_first_of(const basic_string& str, size_type pos = 0) const noexcept;
     template <class T>
-        size_type find_first_of(const T& t, size_type pos = 0) const; // C++17
+        size_type find_first_of(const T& t, size_type pos = 0) const noexcept; // C++17, noexcept as an extension
     size_type find_first_of(const value_type* s, size_type pos, size_type n) const noexcept;
     size_type find_first_of(const value_type* s, size_type pos = 0) const noexcept;
     size_type find_first_of(value_type c, size_type pos = 0) const noexcept;
 
     size_type find_last_of(const basic_string& str, size_type pos = npos) const noexcept;
     template <class T>
-        size_type find_last_of(const T& t, size_type pos = npos) const noexcept;  // C++17
+        size_type find_last_of(const T& t, size_type pos = npos) const noexcept noexcept; // C++17, noexcept as an extension
     size_type find_last_of(const value_type* s, size_type pos, size_type n) const noexcept;
     size_type find_last_of(const value_type* s, size_type pos = npos) const noexcept;
     size_type find_last_of(value_type c, size_type pos = npos) const noexcept;
 
     size_type find_first_not_of(const basic_string& str, size_type pos = 0) const noexcept;
     template <class T>
-        size_type find_first_not_of(const T& t, size_type pos = 0) const; // C++17
+        size_type find_first_not_of(const T& t, size_type pos = 0) const noexcept; // C++17, noexcept as an extension
     size_type find_first_not_of(const value_type* s, size_type pos, size_type n) const noexcept;
     size_type find_first_not_of(const value_type* s, size_type pos = 0) const noexcept;
     size_type find_first_not_of(value_type c, size_type pos = 0) const noexcept;
 
     size_type find_last_not_of(const basic_string& str, size_type pos = npos) const noexcept;
     template <class T>
-        size_type find_last_not_of(const T& t, size_type pos = npos) const; // C++17
+        size_type find_last_not_of(const T& t, size_type pos = npos) const noexcept; // C++17, noexcept as an extension
     size_type find_last_not_of(const value_type* s, size_type pos, size_type n) const noexcept;
     size_type find_last_not_of(const value_type* s, size_type pos = npos) const noexcept;
     size_type find_last_not_of(value_type c, size_type pos = npos) const noexcept;
 
     int compare(const basic_string& str) const noexcept;
     template <class T>
-        int compare(const T& t) const noexcept;  // C++17
+        int compare(const T& t) const noexcept;  // C++17, noexcept as an extension
     int compare(size_type pos1, size_type n1, const basic_string& str) const;
     template <class T>
         int compare(size_type pos1, size_type n1, const T& t) const;  // C++17
@@ -450,6 +455,7 @@ erase_if(basic_string<charT, traits, Allocator>& c, Predicate pred); // C++20
 
 typedef basic_string<char>    string;
 typedef basic_string<wchar_t> wstring;
+typedef basic_string<char8_t> u8string; // C++20
 typedef basic_string<char16_t> u16string;
 typedef basic_string<char32_t> u32string;
 
@@ -494,12 +500,14 @@ wstring to_wstring(double val);
 wstring to_wstring(long double val);
 
 template <> struct hash<string>;
+template <> struct hash<u8string>; // C++20
 template <> struct hash<u16string>;
 template <> struct hash<u32string>;
 template <> struct hash<wstring>;
 
 basic_string<char>     operator "" s( const char *str,     size_t len ); // C++14
 basic_string<wchar_t>  operator "" s( const wchar_t *str,  size_t len ); // C++14
+basic_string<char8_t>  operator "" s( const char8_t *str,  size_t len ); // C++20
 basic_string<char16_t> operator "" s( const char16_t *str, size_t len ); // C++14
 basic_string<char32_t> operator "" s( const char32_t *str, size_t len ); // C++14
 
@@ -508,26 +516,28 @@ basic_string<char32_t> operator "" s( const char32_t *str, size_t len ); // C++1
 */
 
 #include <__config>
-#include <string_view>
-#include <iosfwd>
+#include <__debug>
+#include <__functional_base>
+#include <__iterator/wrap_iter.h>
+#include <algorithm>
+#include <compare>
+#include <cstdio>  // EOF
 #include <cstring>
-#include <cstdio>  // For EOF.
 #include <cwchar>
-#include <algorithm>
+#include <initializer_list>
+#include <iosfwd>
 #include <iterator>
-#include <utility>
 #include <memory>
 #include <stdexcept>
+#include <string_view>
 #include <type_traits>
-#include <initializer_list>
-#include <__functional_base>
+#include <utility>
 #include <version>
+
 #ifndef _LIBCPP_HAS_NO_UNICODE_CHARS
-#include <cstdint>
+# include <cstdint>
 #endif
 
-#include <__debug>
-
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #pragma GCC system_header
 #endif
@@ -625,29 +635,16 @@ __basic_string_common<__b>::__throw_out_of_range() const
 
 _LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __basic_string_common<true>)
 
-#ifdef _LIBCPP_NO_EXCEPTIONS
-template <class _Iter>
-struct __libcpp_string_gets_noexcept_iterator_impl : public true_type {};
-#elif defined(_LIBCPP_HAS_NO_NOEXCEPT)
 template <class _Iter>
-struct __libcpp_string_gets_noexcept_iterator_impl : public false_type {};
-#else
-template <class _Iter, bool = __is_cpp17_forward_iterator<_Iter>::value>
-struct __libcpp_string_gets_noexcept_iterator_impl : public _LIBCPP_BOOL_CONSTANT((
-    noexcept(++(declval<_Iter&>())) &&
-    is_nothrow_assignable<_Iter&, _Iter>::value &&
-    noexcept(declval<_Iter>() == declval<_Iter>()) &&
-    noexcept(*declval<_Iter>())
-)) {};
-
-template <class _Iter>
-struct __libcpp_string_gets_noexcept_iterator_impl<_Iter, false> : public false_type {};
-#endif
+struct __string_is_trivial_iterator : public false_type {};
 
+template <class _Tp>
+struct __string_is_trivial_iterator<_Tp*>
+    : public is_arithmetic<_Tp> {};
 
 template <class _Iter>
-struct __libcpp_string_gets_noexcept_iterator
-    : public _LIBCPP_BOOL_CONSTANT(__libcpp_is_trivial_iterator<_Iter>::value || __libcpp_string_gets_noexcept_iterator_impl<_Iter>::value) {};
+struct __string_is_trivial_iterator<__wrap_iter<_Iter> >
+    : public __string_is_trivial_iterator<_Iter> {};
 
 template <class _CharT, class _Traits, class _Tp>
 struct __can_be_converted_to_string_view : public _BoolConstant<
@@ -668,21 +665,21 @@ struct __padding<_CharT, 1>
 {
 };
 
-#endif  // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+#endif // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
 
-#ifndef _LIBCPP_NO_HAS_CHAR8_T
+#ifndef _LIBCPP_HAS_NO_CHAR8_T
 typedef basic_string<char8_t> u8string;
 #endif
 
 #ifndef _LIBCPP_HAS_NO_UNICODE_CHARS
 typedef basic_string<char16_t> u16string;
 typedef basic_string<char32_t> u32string;
-#endif  // _LIBCPP_HAS_NO_UNICODE_CHARS
+#endif // _LIBCPP_HAS_NO_UNICODE_CHARS
 
 template<class _CharT, class _Traits, class _Allocator>
 class
     _LIBCPP_TEMPLATE_VIS
-#ifndef _LIBCPP_NO_HAS_CHAR8_T
+#ifndef _LIBCPP_HAS_NO_CHAR8_T
     _LIBCPP_PREFERRED_NAME(u8string)
 #endif
 #ifndef _LIBCPP_HAS_NO_UNICODE_CHARS
@@ -736,7 +733,7 @@ private:
 #else  // _LIBCPP_BIG_ENDIAN
     static const size_type __short_mask = 0x80;
     static const size_type __long_mask  = ~(size_type(~0) >> 1);
-#endif  // _LIBCPP_BIG_ENDIAN
+#endif // _LIBCPP_BIG_ENDIAN
 
     enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ?
                       (sizeof(__long) - 1)/sizeof(value_type) : 2};
@@ -766,7 +763,7 @@ private:
 #else  // _LIBCPP_BIG_ENDIAN
     static const size_type __short_mask = 0x01;
     static const size_type __long_mask  = 0x1ul;
-#endif  // _LIBCPP_BIG_ENDIAN
+#endif // _LIBCPP_BIG_ENDIAN
 
     enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ?
                       (sizeof(__long) - 1)/sizeof(value_type) : 2};
@@ -781,7 +778,7 @@ private:
         value_type __data_[__min_cap];
     };
 
-#endif  // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+#endif // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
 
     union __ulx{__long __lx; __short __lxx;};
 
@@ -805,7 +802,7 @@ private:
     __compressed_pair<__rep, allocator_type> __r_;
 
 public:
-    _LIBCPP_FUNC_VIS
+    _LIBCPP_TEMPLATE_DATA_VIS
     static const size_type npos = -1;
 
     _LIBCPP_INLINE_VISIBILITY basic_string()
@@ -832,7 +829,7 @@ public:
 
     _LIBCPP_INLINE_VISIBILITY
     basic_string(basic_string&& __str, const allocator_type& __a);
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
     template <class = _EnableIf<__is_allocator<_Allocator>::value, nullptr_t> >
     _LIBCPP_INLINE_VISIBILITY
@@ -848,6 +845,10 @@ public:
         _LIBCPP_INLINE_VISIBILITY
         basic_string(const _CharT* __s, const _Allocator& __a);
 
+#if _LIBCPP_STD_VER > 20
+    basic_string(nullptr_t) = delete;
+#endif
+
     _LIBCPP_INLINE_VISIBILITY
     basic_string(const _CharT* __s, size_type __n);
     _LIBCPP_INLINE_VISIBILITY
@@ -890,7 +891,7 @@ public:
     basic_string(initializer_list<_CharT> __il);
     _LIBCPP_INLINE_VISIBILITY
     basic_string(initializer_list<_CharT> __il, const _Allocator& __a);
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
     inline ~basic_string();
 
@@ -911,6 +912,9 @@ public:
     basic_string& operator=(initializer_list<value_type> __il) {return assign(__il.begin(), __il.size());}
 #endif
     _LIBCPP_INLINE_VISIBILITY basic_string& operator=(const value_type* __s) {return assign(__s);}
+#if _LIBCPP_STD_VER > 20
+    basic_string& operator=(nullptr_t) = delete;
+#endif
     basic_string& operator=(value_type __c);
 
 #if _LIBCPP_DEBUG_LEVEL == 2
@@ -939,7 +943,7 @@ public:
     _LIBCPP_INLINE_VISIBILITY
     const_iterator end() const _NOEXCEPT
         {return const_iterator(__get_pointer() + size());}
-#endif  // _LIBCPP_DEBUG_LEVEL == 2
+#endif // _LIBCPP_DEBUG_LEVEL == 2
     _LIBCPP_INLINE_VISIBILITY
     reverse_iterator rbegin() _NOEXCEPT
         {return reverse_iterator(end());}
@@ -1010,7 +1014,7 @@ public:
     _LIBCPP_INLINE_VISIBILITY basic_string& operator+=(value_type __c)            {push_back(__c); return *this;}
 #ifndef _LIBCPP_CXX03_LANG
     _LIBCPP_INLINE_VISIBILITY basic_string& operator+=(initializer_list<value_type> __il) {return append(__il);}
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
     _LIBCPP_INLINE_VISIBILITY
     basic_string& append(const basic_string& __str);
@@ -1041,20 +1045,16 @@ public:
     _LIBCPP_INLINE_VISIBILITY
     void __append_default_init(size_type __n);
 
-    template <class _ForwardIterator>
-    _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
-    basic_string& __append_forward_unsafe(_ForwardIterator, _ForwardIterator);
     template<class _InputIterator>
     _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
     _EnableIf
         <
-            __is_exactly_cpp17_input_iterator<_InputIterator>::value
-                || !__libcpp_string_gets_noexcept_iterator<_InputIterator>::value,
+            __is_exactly_cpp17_input_iterator<_InputIterator>::value,
             basic_string&
         >
     _LIBCPP_INLINE_VISIBILITY
     append(_InputIterator __first, _InputIterator __last) {
-      const basic_string __temp (__first, __last, __alloc());
+      const basic_string __temp(__first, __last, __alloc());
       append(__temp.data(), __temp.size());
       return *this;
     }
@@ -1062,19 +1062,16 @@ public:
     _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
     _EnableIf
         <
-            __is_cpp17_forward_iterator<_ForwardIterator>::value
-                && __libcpp_string_gets_noexcept_iterator<_ForwardIterator>::value,
+            __is_cpp17_forward_iterator<_ForwardIterator>::value,
             basic_string&
         >
     _LIBCPP_INLINE_VISIBILITY
-    append(_ForwardIterator __first, _ForwardIterator __last) {
-      return __append_forward_unsafe(__first, __last);
-    }
+    append(_ForwardIterator __first, _ForwardIterator __last);
 
 #ifndef _LIBCPP_CXX03_LANG
     _LIBCPP_INLINE_VISIBILITY
     basic_string& append(initializer_list<value_type> __il) {return append(__il.begin(), __il.size());}
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
     void push_back(value_type __c);
     _LIBCPP_INLINE_VISIBILITY
@@ -1117,8 +1114,7 @@ public:
     _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
     _EnableIf
         <
-           __is_exactly_cpp17_input_iterator<_InputIterator>::value
-                || !__libcpp_string_gets_noexcept_iterator<_InputIterator>::value,
+            __is_exactly_cpp17_input_iterator<_InputIterator>::value,
             basic_string&
         >
         assign(_InputIterator __first, _InputIterator __last);
@@ -1126,15 +1122,14 @@ public:
     _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
     _EnableIf
         <
-            __is_cpp17_forward_iterator<_ForwardIterator>::value
-                 && __libcpp_string_gets_noexcept_iterator<_ForwardIterator>::value,
+            __is_cpp17_forward_iterator<_ForwardIterator>::value,
             basic_string&
         >
         assign(_ForwardIterator __first, _ForwardIterator __last);
 #ifndef _LIBCPP_CXX03_LANG
     _LIBCPP_INLINE_VISIBILITY
     basic_string& assign(initializer_list<value_type> __il) {return assign(__il.begin(), __il.size());}
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
     _LIBCPP_INLINE_VISIBILITY
     basic_string& insert(size_type __pos1, const basic_string& __str);
@@ -1168,8 +1163,7 @@ public:
     _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
     _EnableIf
         <
-           __is_exactly_cpp17_input_iterator<_InputIterator>::value
-                || !__libcpp_string_gets_noexcept_iterator<_InputIterator>::value,
+            __is_exactly_cpp17_input_iterator<_InputIterator>::value,
             iterator
         >
         insert(const_iterator __pos, _InputIterator __first, _InputIterator __last);
@@ -1177,8 +1171,7 @@ public:
     _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
     _EnableIf
         <
-            __is_cpp17_forward_iterator<_ForwardIterator>::value
-                 && __libcpp_string_gets_noexcept_iterator<_ForwardIterator>::value,
+            __is_cpp17_forward_iterator<_ForwardIterator>::value,
             iterator
         >
         insert(const_iterator __pos, _ForwardIterator __first, _ForwardIterator __last);
@@ -1186,7 +1179,7 @@ public:
     _LIBCPP_INLINE_VISIBILITY
     iterator insert(const_iterator __pos, initializer_list<value_type> __il)
                     {return insert(__pos, __il.begin(), __il.end());}
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
     basic_string& erase(size_type __pos = 0, size_type __n = npos);
     _LIBCPP_INLINE_VISIBILITY
@@ -1247,7 +1240,7 @@ public:
     _LIBCPP_INLINE_VISIBILITY
     basic_string& replace(const_iterator __i1, const_iterator __i2, initializer_list<value_type> __il)
         {return replace(__i1, __i2, __il.begin(), __il.end());}
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
     size_type copy(value_type* __s, size_type __n, size_type __pos = 0) const;
     _LIBCPP_INLINE_VISIBILITY
@@ -1284,7 +1277,7 @@ public:
             __can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value,
             size_type
         >
-              find(const _Tp& __t, size_type __pos = 0) const;
+              find(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT;
     size_type find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT;
     _LIBCPP_INLINE_VISIBILITY
     size_type find(const value_type* __s, size_type __pos = 0) const _NOEXCEPT;
@@ -1300,7 +1293,7 @@ public:
             __can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value,
             size_type
         >
-              rfind(const _Tp& __t, size_type __pos = npos) const;
+              rfind(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT;
     size_type rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT;
     _LIBCPP_INLINE_VISIBILITY
     size_type rfind(const value_type* __s, size_type __pos = npos) const _NOEXCEPT;
@@ -1316,7 +1309,7 @@ public:
             __can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value,
             size_type
         >
-              find_first_of(const _Tp& __t, size_type __pos = 0) const;
+              find_first_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT;
     size_type find_first_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT;
     _LIBCPP_INLINE_VISIBILITY
     size_type find_first_of(const value_type* __s, size_type __pos = 0) const _NOEXCEPT;
@@ -1333,7 +1326,7 @@ public:
             __can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value,
             size_type
         >
-              find_last_of(const _Tp& __t, size_type __pos = npos) const;
+              find_last_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT;
     size_type find_last_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT;
     _LIBCPP_INLINE_VISIBILITY
     size_type find_last_of(const value_type* __s, size_type __pos = npos) const _NOEXCEPT;
@@ -1350,7 +1343,7 @@ public:
             __can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value,
             size_type
         >
-              find_first_not_of(const _Tp &__t, size_type __pos = 0) const;
+              find_first_not_of(const _Tp &__t, size_type __pos = 0) const _NOEXCEPT;
     size_type find_first_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT;
     _LIBCPP_INLINE_VISIBILITY
     size_type find_first_not_of(const value_type* __s, size_type __pos = 0) const _NOEXCEPT;
@@ -1367,7 +1360,7 @@ public:
             __can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value,
             size_type
         >
-              find_last_not_of(const _Tp& __t, size_type __pos = npos) const;
+              find_last_not_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT;
     size_type find_last_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT;
     _LIBCPP_INLINE_VISIBILITY
     size_type find_last_not_of(const value_type* __s, size_type __pos = npos) const _NOEXCEPT;
@@ -1384,7 +1377,7 @@ public:
             __can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value,
             int
         >
-        compare(const _Tp &__t) const;
+        compare(const _Tp &__t) const _NOEXCEPT;
 
     template <class _Tp>
     _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
@@ -1468,7 +1461,7 @@ public:
     bool __addable(const const_iterator* __i, ptrdiff_t __n) const;
     bool __subscriptable(const const_iterator* __i, ptrdiff_t __n) const;
 
-#endif  // _LIBCPP_DEBUG_LEVEL == 2
+#endif // _LIBCPP_DEBUG_LEVEL == 2
 
 private:
     _LIBCPP_INLINE_VISIBILITY
@@ -1514,7 +1507,7 @@ private:
         {return __r_.first().__s.__size_ >> 1;}
 #   endif
 
-#endif  // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+#endif // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
 
     _LIBCPP_INLINE_VISIBILITY
     void __set_long_size(size_type __s) _NOEXCEPT
@@ -1714,6 +1707,13 @@ private:
     _LIBCPP_INLINE_VISIBILITY void __invalidate_all_iterators();
     _LIBCPP_INLINE_VISIBILITY void __invalidate_iterators_past(size_type);
 
+    template<class _Tp>
+    _LIBCPP_INLINE_VISIBILITY
+    bool __addr_in_range(_Tp&& __t) const {
+        const volatile void *__p = _VSTD::addressof(__t);
+        return data() <= __p && __p <= data() + size();
+    }
+
     friend basic_string operator+<>(const basic_string&, const basic_string&);
     friend basic_string operator+<>(const value_type*, const basic_string&);
     friend basic_string operator+<>(value_type, const basic_string&);
@@ -1734,7 +1734,7 @@ _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST(_LIBCPP_EXTERN_TEMPLATE, wchar_t)
 
 #ifndef _LIBCPP_HAS_NO_DEDUCTION_GUIDES
 template<class _InputIterator,
-         class _CharT = typename iterator_traits<_InputIterator>::value_type,
+         class _CharT = __iter_value_type<_InputIterator>,
          class _Allocator = allocator<_CharT>,
          class = _EnableIf<__is_cpp17_input_iterator<_InputIterator>::value>,
          class = _EnableIf<__is_allocator<_Allocator>::value>
@@ -1773,11 +1773,7 @@ basic_string<_CharT, _Traits, _Allocator>::__invalidate_all_iterators()
 template <class _CharT, class _Traits, class _Allocator>
 inline
 void
-basic_string<_CharT, _Traits, _Allocator>::__invalidate_iterators_past(size_type
-#if _LIBCPP_DEBUG_LEVEL == 2
-                                                                        __pos
-#endif
-                                                                      )
+basic_string<_CharT, _Traits, _Allocator>::__invalidate_iterators_past(size_type __pos)
 {
 #if _LIBCPP_DEBUG_LEVEL == 2
     __c_node* __c = __get_db()->__find_c_and_lock(this);
@@ -1797,7 +1793,9 @@ basic_string<_CharT, _Traits, _Allocator>::__invalidate_iterators_past(size_type
         }
         __get_db()->unlock();
     }
-#endif  // _LIBCPP_DEBUG_LEVEL == 2
+#else
+    (void)__pos;
+#endif // _LIBCPP_DEBUG_LEVEL == 2
 }
 
 template <class _CharT, class _Traits, class _Allocator>
@@ -2001,7 +1999,7 @@ basic_string<_CharT, _Traits, _Allocator>::basic_string(basic_string&& __str, co
 #endif
 }
 
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
 template <class _CharT, class _Traits, class _Allocator>
 void
@@ -2129,7 +2127,7 @@ basic_string<_CharT, _Traits, _Allocator>::__init(_InputIterator __first, _Input
 #ifndef _LIBCPP_NO_EXCEPTIONS
     try
     {
-#endif  // _LIBCPP_NO_EXCEPTIONS
+#endif // _LIBCPP_NO_EXCEPTIONS
     for (; __first != __last; ++__first)
         push_back(*__first);
 #ifndef _LIBCPP_NO_EXCEPTIONS
@@ -2140,7 +2138,7 @@ basic_string<_CharT, _Traits, _Allocator>::__init(_InputIterator __first, _Input
             __alloc_traits::deallocate(__alloc(), __get_long_pointer(), __get_long_cap());
         throw;
     }
-#endif  // _LIBCPP_NO_EXCEPTIONS
+#endif // _LIBCPP_NO_EXCEPTIONS
 }
 
 template <class _CharT, class _Traits, class _Allocator>
@@ -2168,9 +2166,23 @@ basic_string<_CharT, _Traits, _Allocator>::__init(_ForwardIterator __first, _For
         __set_long_cap(__cap+1);
         __set_long_size(__sz);
     }
+
+#ifndef _LIBCPP_NO_EXCEPTIONS
+    try
+    {
+#endif  // _LIBCPP_NO_EXCEPTIONS
     for (; __first != __last; ++__first, (void) ++__p)
         traits_type::assign(*__p, *__first);
     traits_type::assign(*__p, value_type());
+#ifndef _LIBCPP_NO_EXCEPTIONS
+    }
+    catch (...)
+    {
+        if (__is_long())
+            __alloc_traits::deallocate(__alloc(), __get_long_pointer(), __get_long_cap());
+        throw;
+    }
+#endif  // _LIBCPP_NO_EXCEPTIONS
 }
 
 template <class _CharT, class _Traits, class _Allocator>
@@ -2225,7 +2237,7 @@ basic_string<_CharT, _Traits, _Allocator>::basic_string(
 #endif
 }
 
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
 template <class _CharT, class _Traits, class _Allocator>
 basic_string<_CharT, _Traits, _Allocator>::~basic_string()
@@ -2357,12 +2369,11 @@ basic_string<_CharT, _Traits, _Allocator>::assign(size_type __n, value_type __c)
         size_type __sz = size();
         __grow_by(__cap, __n - __cap, __sz, 0, __sz);
     }
-    else
-        __invalidate_iterators_past(__n);
     value_type* __p = _VSTD::__to_address(__get_pointer());
     traits_type::assign(__p, __n, __c);
     traits_type::assign(__p[__n], value_type());
     __set_size(__n);
+    __invalidate_iterators_past(__n);
     return *this;
 }
 
@@ -2463,8 +2474,7 @@ template <class _CharT, class _Traits, class _Allocator>
 template<class _InputIterator>
 _EnableIf
 <
-     __is_exactly_cpp17_input_iterator <_InputIterator>::value
-          || !__libcpp_string_gets_noexcept_iterator<_InputIterator>::value,
+     __is_exactly_cpp17_input_iterator<_InputIterator>::value,
     basic_string<_CharT, _Traits, _Allocator>&
 >
 basic_string<_CharT, _Traits, _Allocator>::assign(_InputIterator __first, _InputIterator __last)
@@ -2478,26 +2488,35 @@ template <class _CharT, class _Traits, class _Allocator>
 template<class _ForwardIterator>
 _EnableIf
 <
-    __is_cpp17_forward_iterator<_ForwardIterator>::value
-         && __libcpp_string_gets_noexcept_iterator<_ForwardIterator>::value,
+    __is_cpp17_forward_iterator<_ForwardIterator>::value,
     basic_string<_CharT, _Traits, _Allocator>&
 >
 basic_string<_CharT, _Traits, _Allocator>::assign(_ForwardIterator __first, _ForwardIterator __last)
 {
-    size_type __n = static_cast<size_type>(_VSTD::distance(__first, __last));
     size_type __cap = capacity();
-    if (__cap < __n)
+    size_type __n = __string_is_trivial_iterator<_ForwardIterator>::value ?
+        static_cast<size_type>(_VSTD::distance(__first, __last)) : 0;
+
+    if (__string_is_trivial_iterator<_ForwardIterator>::value &&
+        (__cap >= __n || !__addr_in_range(*__first)))
     {
-        size_type __sz = size();
-        __grow_by(__cap, __n - __cap, __sz, 0, __sz);
+        if (__cap < __n)
+        {
+            size_type __sz = size();
+            __grow_by(__cap, __n - __cap, __sz, 0, __sz);
+        }
+        pointer __p = __get_pointer();
+        for (; __first != __last; ++__first, ++__p)
+            traits_type::assign(*__p, *__first);
+        traits_type::assign(*__p, value_type());
+        __set_size(__n);
+        __invalidate_iterators_past(__n);
     }
     else
-        __invalidate_iterators_past(__n);
-    pointer __p = __get_pointer();
-    for (; __first != __last; ++__first, ++__p)
-        traits_type::assign(*__p, *__first);
-    traits_type::assign(*__p, value_type());
-    __set_size(__n);
+    {
+        const basic_string __temp(__first, __last, __alloc());
+        assign(__temp.data(), __temp.size());
+    }
     return *this;
 }
 
@@ -2644,39 +2663,23 @@ basic_string<_CharT, _Traits, _Allocator>::push_back(value_type __c)
     traits_type::assign(*++__p, value_type());
 }
 
-template <class _Tp>
-bool __ptr_in_range (const _Tp* __p, const _Tp* __first, const _Tp* __last)
-{
-    return __first <= __p && __p < __last;
-}
-
-template <class _Tp1, class _Tp2>
-bool __ptr_in_range (const _Tp1*, const _Tp2*, const _Tp2*)
-{
-    return false;
-}
-
 template <class _CharT, class _Traits, class _Allocator>
 template<class _ForwardIterator>
-basic_string<_CharT, _Traits, _Allocator>&
-basic_string<_CharT, _Traits, _Allocator>::__append_forward_unsafe(
+_EnableIf
+<
+    __is_cpp17_forward_iterator<_ForwardIterator>::value,
+    basic_string<_CharT, _Traits, _Allocator>&
+>
+basic_string<_CharT, _Traits, _Allocator>::append(
     _ForwardIterator __first, _ForwardIterator __last)
 {
-    static_assert(__is_cpp17_forward_iterator<_ForwardIterator>::value,
-                  "function requires a ForwardIterator");
     size_type __sz = size();
     size_type __cap = capacity();
     size_type __n = static_cast<size_type>(_VSTD::distance(__first, __last));
     if (__n)
     {
-        typedef typename iterator_traits<_ForwardIterator>::reference _CharRef;
-        _CharRef __tmp_ref = *__first;
-        if (__ptr_in_range(_VSTD::addressof(__tmp_ref), data(), data() + size()))
-        {
-            const basic_string __temp (__first, __last, __alloc());
-            append(__temp.data(), __temp.size());
-        }
-        else
+        if (__string_is_trivial_iterator<_ForwardIterator>::value &&
+            !__addr_in_range(*__first))
         {
             if (__cap - __sz < __n)
                 __grow_by(__cap, __sz + __n - __cap, __sz, __sz, 0);
@@ -2686,6 +2689,11 @@ basic_string<_CharT, _Traits, _Allocator>::__append_forward_unsafe(
             traits_type::assign(*__p, value_type());
             __set_size(__sz + __n);
         }
+        else
+        {
+            const basic_string __temp(__first, __last, __alloc());
+            append(__temp.data(), __temp.size());
+        }
     }
     return *this;
 }
@@ -2801,8 +2809,7 @@ template <class _CharT, class _Traits, class _Allocator>
 template<class _InputIterator>
 _EnableIf
 <
-   __is_exactly_cpp17_input_iterator<_InputIterator>::value
-        || !__libcpp_string_gets_noexcept_iterator<_InputIterator>::value,
+   __is_exactly_cpp17_input_iterator<_InputIterator>::value,
    typename basic_string<_CharT, _Traits, _Allocator>::iterator
 >
 basic_string<_CharT, _Traits, _Allocator>::insert(const_iterator __pos, _InputIterator __first, _InputIterator __last)
@@ -2820,8 +2827,7 @@ template <class _CharT, class _Traits, class _Allocator>
 template<class _ForwardIterator>
 _EnableIf
 <
-    __is_cpp17_forward_iterator<_ForwardIterator>::value
-        && __libcpp_string_gets_noexcept_iterator<_ForwardIterator>::value,
+    __is_cpp17_forward_iterator<_ForwardIterator>::value,
     typename basic_string<_CharT, _Traits, _Allocator>::iterator
 >
 basic_string<_CharT, _Traits, _Allocator>::insert(const_iterator __pos, _ForwardIterator __first, _ForwardIterator __last)
@@ -2835,34 +2841,35 @@ basic_string<_CharT, _Traits, _Allocator>::insert(const_iterator __pos, _Forward
     size_type __n = static_cast<size_type>(_VSTD::distance(__first, __last));
     if (__n)
     {
-        typedef typename iterator_traits<_ForwardIterator>::reference _CharRef;
-        _CharRef __tmp_char = *__first;
-        if (__ptr_in_range(_VSTD::addressof(__tmp_char), data(), data() + size()))
+        if (__string_is_trivial_iterator<_ForwardIterator>::value &&
+            !__addr_in_range(*__first))
         {
-            const basic_string __temp(__first, __last, __alloc());
-            return insert(__pos, __temp.data(), __temp.data() + __temp.size());
-        }
-
-        size_type __sz = size();
-        size_type __cap = capacity();
-        value_type* __p;
-        if (__cap - __sz >= __n)
-        {
-            __p = _VSTD::__to_address(__get_pointer());
-            size_type __n_move = __sz - __ip;
-            if (__n_move != 0)
-                traits_type::move(__p + __ip + __n, __p + __ip, __n_move);
+            size_type __sz = size();
+            size_type __cap = capacity();
+            value_type* __p;
+            if (__cap - __sz >= __n)
+            {
+                __p = _VSTD::__to_address(__get_pointer());
+                size_type __n_move = __sz - __ip;
+                if (__n_move != 0)
+                    traits_type::move(__p + __ip + __n, __p + __ip, __n_move);
+            }
+            else
+            {
+                __grow_by(__cap, __sz + __n - __cap, __sz, __ip, 0, __n);
+                __p = _VSTD::__to_address(__get_long_pointer());
+            }
+            __sz += __n;
+            __set_size(__sz);
+            traits_type::assign(__p[__sz], value_type());
+            for (__p += __ip; __first != __last; ++__p, ++__first)
+                traits_type::assign(*__p, *__first);
         }
         else
         {
-            __grow_by(__cap, __sz + __n - __cap, __sz, __ip, 0, __n);
-            __p = _VSTD::__to_address(__get_long_pointer());
+            const basic_string __temp(__first, __last, __alloc());
+            return insert(__pos, __temp.data(), __temp.data() + __temp.size());
         }
-        __sz += __n;
-        __set_size(__sz);
-        traits_type::assign(__p[__sz], value_type());
-        for (__p += __ip; __first != __last; ++__p, ++__first)
-            traits_type::assign(*__p, *__first);
     }
     return begin() + __ip;
 }
@@ -3353,7 +3360,7 @@ basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target
         #ifndef _LIBCPP_NO_EXCEPTIONS
             try
             {
-        #endif  // _LIBCPP_NO_EXCEPTIONS
+        #endif // _LIBCPP_NO_EXCEPTIONS
                 __new_data = __alloc_traits::allocate(__alloc(), __target_capacity+1);
         #ifndef _LIBCPP_NO_EXCEPTIONS
             }
@@ -3364,7 +3371,7 @@ basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target
         #else  // _LIBCPP_NO_EXCEPTIONS
             if (__new_data == nullptr)
                 return;
-        #endif  // _LIBCPP_NO_EXCEPTIONS
+        #endif // _LIBCPP_NO_EXCEPTIONS
         }
         __now_long = true;
         __was_long = __is_long();
@@ -3543,7 +3550,7 @@ _EnableIf
     typename basic_string<_CharT, _Traits, _Allocator>::size_type
 >
 basic_string<_CharT, _Traits, _Allocator>::find(const _Tp &__t,
-                                                size_type __pos) const
+                                                size_type __pos) const _NOEXCEPT
 {
     __self_view __sv = __t;
     return __str_find<value_type, size_type, traits_type, npos>
@@ -3601,7 +3608,7 @@ _EnableIf
     typename basic_string<_CharT, _Traits, _Allocator>::size_type
 >
 basic_string<_CharT, _Traits, _Allocator>::rfind(const _Tp& __t,
-                                                size_type __pos) const
+                                                size_type __pos) const _NOEXCEPT
 {
     __self_view __sv = __t;
     return __str_rfind<value_type, size_type, traits_type, npos>
@@ -3659,7 +3666,7 @@ _EnableIf
     typename basic_string<_CharT, _Traits, _Allocator>::size_type
 >
 basic_string<_CharT, _Traits, _Allocator>::find_first_of(const _Tp& __t,
-                                                size_type __pos) const
+                                                size_type __pos) const _NOEXCEPT
 {
     __self_view __sv = __t;
     return __str_find_first_of<value_type, size_type, traits_type, npos>
@@ -3717,7 +3724,7 @@ _EnableIf
     typename basic_string<_CharT, _Traits, _Allocator>::size_type
 >
 basic_string<_CharT, _Traits, _Allocator>::find_last_of(const _Tp& __t,
-                                                size_type __pos) const
+                                                size_type __pos) const _NOEXCEPT
 {
     __self_view __sv = __t;
     return __str_find_last_of<value_type, size_type, traits_type, npos>
@@ -3775,7 +3782,7 @@ _EnableIf
     typename basic_string<_CharT, _Traits, _Allocator>::size_type
 >
 basic_string<_CharT, _Traits, _Allocator>::find_first_not_of(const _Tp& __t,
-                                                size_type __pos) const
+                                                size_type __pos) const _NOEXCEPT
 {
     __self_view __sv = __t;
     return __str_find_first_not_of<value_type, size_type, traits_type, npos>
@@ -3834,7 +3841,7 @@ _EnableIf
     typename basic_string<_CharT, _Traits, _Allocator>::size_type
 >
 basic_string<_CharT, _Traits, _Allocator>::find_last_not_of(const _Tp& __t,
-                                                size_type __pos) const
+                                                size_type __pos) const _NOEXCEPT
 {
     __self_view __sv = __t;
     return __str_find_last_not_of<value_type, size_type, traits_type, npos>
@@ -3871,7 +3878,7 @@ _EnableIf
     __can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value,
     int
 >
-basic_string<_CharT, _Traits, _Allocator>::compare(const _Tp& __t) const
+basic_string<_CharT, _Traits, _Allocator>::compare(const _Tp& __t) const _NOEXCEPT
 {
     __self_view __sv = __t;
     size_t __lhs_sz = size();
@@ -4349,7 +4356,7 @@ operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, _CharT __rhs)
     return _VSTD::move(__lhs);
 }
 
-#endif  // _LIBCPP_CXX03_LANG
+#endif // _LIBCPP_CXX03_LANG
 
 // swap
 
@@ -4404,7 +4411,7 @@ _LIBCPP_FUNC_VIS wstring to_wstring(double __val);
 _LIBCPP_FUNC_VIS wstring to_wstring(long double __val);
 
 template<class _CharT, class _Traits, class _Allocator>
-_LIBCPP_FUNC_VIS
+_LIBCPP_TEMPLATE_DATA_VIS
 const typename basic_string<_CharT, _Traits, _Allocator>::size_type
                basic_string<_CharT, _Traits, _Allocator>::npos;
 
@@ -4441,8 +4448,6 @@ basic_istream<_CharT, _Traits>&
 getline(basic_istream<_CharT, _Traits>& __is,
         basic_string<_CharT, _Traits, _Allocator>& __str);
 
-#ifndef _LIBCPP_CXX03_LANG
-
 template<class _CharT, class _Traits, class _Allocator>
 inline _LIBCPP_INLINE_VISIBILITY
 basic_istream<_CharT, _Traits>&
@@ -4455,8 +4460,6 @@ basic_istream<_CharT, _Traits>&
 getline(basic_istream<_CharT, _Traits>&& __is,
         basic_string<_CharT, _Traits, _Allocator>& __str);
 
-#endif  // _LIBCPP_CXX03_LANG
-
 #if _LIBCPP_STD_VER > 17
 template <class _CharT, class _Traits, class _Allocator, class _Up>
 inline _LIBCPP_INLINE_VISIBILITY
@@ -4513,7 +4516,7 @@ basic_string<_CharT, _Traits, _Allocator>::__subscriptable(const const_iterator*
     return this->data() <= __p && __p < this->data() + this->size();
 }
 
-#endif  // _LIBCPP_DEBUG_LEVEL == 2
+#endif // _LIBCPP_DEBUG_LEVEL == 2
 
 #if _LIBCPP_STD_VER > 11
 // Literal suffixes for basic_string [basic.string.literals]
@@ -4533,7 +4536,7 @@ inline namespace literals
         return basic_string<wchar_t> (__str, __len);
     }
 
-#ifndef _LIBCPP_NO_HAS_CHAR8_T
+#ifndef _LIBCPP_HAS_NO_CHAR8_T
     inline _LIBCPP_INLINE_VISIBILITY
     basic_string<char8_t> operator "" s(const char8_t *__str, size_t __len) _NOEXCEPT
     {
@@ -4560,4 +4563,4 @@ _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
 
-#endif  // _LIBCPP_STRING
+#endif // _LIBCPP_STRING
-- 
cgit v1.2.3


From db4fea6689eb34959028cad3b63de45da65683d3 Mon Sep 17 00:00:00 2001
From: Andrew Kelley <andrew@ziglang.org>
Date: Sat, 28 Aug 2021 13:11:47 -0700
Subject: update libcxx, libcxxabi, and C headers to release/13.x branch

upstream commit 9c49fee5e7ac0ca8bc4ec1c3738ca0d83df65852
---
 lib/include/__clang_cuda_device_functions.h        | 276 +++++++++++++--------
 lib/include/__clang_hip_cmath.h                    | 188 ++++++++------
 lib/include/__clang_hip_math.h                     |  50 +++-
 lib/include/intrin.h                               |   3 +
 .../__clang_openmp_device_functions.h              |  32 ++-
 lib/include/openmp_wrappers/cmath                  |  54 ++++
 lib/include/openmp_wrappers/math.h                 |  10 +
 lib/libcxx/include/cwctype                         |   2 +
 lib/libcxx/include/string                          |  19 ++
 lib/libcxx/include/vector                          |  20 ++
 lib/libcxx/include/wctype.h                        |  10 +
 lib/libcxxabi/src/cxa_personality.cpp              |   2 +-
 12 files changed, 476 insertions(+), 190 deletions(-)

(limited to 'lib/libcxx/include/string')

diff --git a/lib/include/__clang_cuda_device_functions.h b/lib/include/__clang_cuda_device_functions.h
index f801e5426a..cc4e1a4dd9 100644
--- a/lib/include/__clang_cuda_device_functions.h
+++ b/lib/include/__clang_cuda_device_functions.h
@@ -34,10 +34,12 @@ __DEVICE__ unsigned long long __brevll(unsigned long long __a) {
   return __nv_brevll(__a);
 }
 #if defined(__cplusplus)
-__DEVICE__ void __brkpt() { asm volatile("brkpt;"); }
+__DEVICE__ void __brkpt() { __asm__ __volatile__("brkpt;"); }
 __DEVICE__ void __brkpt(int __a) { __brkpt(); }
 #else
-__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); }
+__DEVICE__ void __attribute__((overloadable)) __brkpt(void) {
+  __asm__ __volatile__("brkpt;");
+}
 __DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); }
 #endif
 __DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
@@ -507,7 +509,7 @@ __DEVICE__ float __powf(float __a, float __b) {
 }
 
 // Parameter must have a known integer value.
-#define __prof_trigger(__a) asm __volatile__("pmevent \t%0;" ::"i"(__a))
+#define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a))
 __DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); }
 __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
   return __nv_sad(__a, __b, __c);
@@ -526,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
 __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
 __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
 __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
-__DEVICE__ void __trap(void) { asm volatile("trap;"); }
+__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
 __DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
   return __nvvm_atom_add_gen_i((int *)__p, __v);
 }
@@ -1051,122 +1053,136 @@ __DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) {
 }
 __DEVICE__ unsigned int __vabs2(unsigned int __a) {
   unsigned int r;
-  asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabs4(unsigned int __a) {
   unsigned int r;
-  asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 
 __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsss2(unsigned int __a) {
   unsigned int r;
-  asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vabsss4(unsigned int __a) {
   unsigned int r;
-  asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.eq %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
@@ -1174,7 +1190,9 @@ __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.eq %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
@@ -1182,7 +1200,9 @@ __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
@@ -1190,7 +1210,9 @@ __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
@@ -1198,7 +1220,9 @@ __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
@@ -1206,7 +1230,9 @@ __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
@@ -1214,7 +1240,9 @@ __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
@@ -1222,7 +1250,9 @@ __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
@@ -1230,7 +1260,9 @@ __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
@@ -1238,7 +1270,9 @@ __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
@@ -1246,7 +1280,9 @@ __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
@@ -1254,7 +1290,9 @@ __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
@@ -1262,7 +1300,9 @@ __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
@@ -1270,7 +1310,9 @@ __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
@@ -1278,7 +1320,9 @@ __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
@@ -1286,7 +1330,9 @@ __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
@@ -1294,7 +1340,9 @@ __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
@@ -1302,7 +1350,9 @@ __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
@@ -1310,7 +1360,9 @@ __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.ne %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
@@ -1318,7 +1370,9 @@ __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.ne %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
@@ -1345,94 +1399,112 @@ __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
     unsigned mask = __vcmpgts2(__a, __b);
     r = (__a & mask) | (__b & ~mask);
   } else {
-    asm("vmax2.s32.s32.s32 %0,%1,%2,%3;"
-        : "=r"(r)
-        : "r"(__a), "r"(__b), "r"(0));
+    __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;"
+            : "=r"(r)
+            : "r"(__a), "r"(__b), "r"(0));
   }
   return r;
 }
 __DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 
 __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); }
 
 __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); }
 __DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vnegss2(unsigned int __a) {
@@ -1440,9 +1512,9 @@ __DEVICE__ unsigned int __vnegss2(unsigned int __a) {
 }
 __DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub4.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub4.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vnegss4(unsigned int __a) {
@@ -1450,16 +1522,16 @@ __DEVICE__ unsigned int __vnegss4(unsigned int __a) {
 }
 __DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub2.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub2.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
   unsigned int r;
-  asm("vsub4.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub4.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
   return r;
 }
 #endif // CUDA_VERSION >= 9020
diff --git a/lib/include/__clang_hip_cmath.h b/lib/include/__clang_hip_cmath.h
index 7342705434..d488db0a94 100644
--- a/lib/include/__clang_hip_cmath.h
+++ b/lib/include/__clang_hip_cmath.h
@@ -10,7 +10,7 @@
 #ifndef __CLANG_HIP_CMATH_H__
 #define __CLANG_HIP_CMATH_H__
 
-#if !defined(__HIP__)
+#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
 #error "This file is for HIP and OpenMP AMDGCN device compilation only."
 #endif
 
@@ -25,31 +25,43 @@
 #endif // !defined(__HIPCC_RTC__)
 
 #pragma push_macro("__DEVICE__")
+#pragma push_macro("__CONSTEXPR__")
+#ifdef __OPENMP_AMDGCN__
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
+#define __CONSTEXPR__ constexpr
+#else
 #define __DEVICE__ static __device__ inline __attribute__((always_inline))
+#define __CONSTEXPR__
+#endif // __OPENMP_AMDGCN__
 
 // Start with functions that cannot be defined by DEF macros below.
 #if defined(__cplusplus)
-__DEVICE__ double abs(double __x) { return ::fabs(__x); }
-__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
-__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
-__DEVICE__ long abs(long __n) { return ::labs(__n); }
-__DEVICE__ float fma(float __x, float __y, float __z) {
+#if defined __OPENMP_AMDGCN__
+__DEVICE__ __CONSTEXPR__ float fabs(float __x) { return ::fabsf(__x); }
+__DEVICE__ __CONSTEXPR__ float sin(float __x) { return ::sinf(__x); }
+__DEVICE__ __CONSTEXPR__ float cos(float __x) { return ::cosf(__x); }
+#endif
+__DEVICE__ __CONSTEXPR__ double abs(double __x) { return ::fabs(__x); }
+__DEVICE__ __CONSTEXPR__ float abs(float __x) { return ::fabsf(__x); }
+__DEVICE__ __CONSTEXPR__ long long abs(long long __n) { return ::llabs(__n); }
+__DEVICE__ __CONSTEXPR__ long abs(long __n) { return ::labs(__n); }
+__DEVICE__ __CONSTEXPR__ float fma(float __x, float __y, float __z) {
   return ::fmaf(__x, __y, __z);
 }
 #if !defined(__HIPCC_RTC__)
 // The value returned by fpclassify is platform dependent, therefore it is not
 // supported by hipRTC.
-__DEVICE__ int fpclassify(float __x) {
+__DEVICE__ __CONSTEXPR__ int fpclassify(float __x) {
   return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                               FP_ZERO, __x);
 }
-__DEVICE__ int fpclassify(double __x) {
+__DEVICE__ __CONSTEXPR__ int fpclassify(double __x) {
   return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                               FP_ZERO, __x);
 }
 #endif // !defined(__HIPCC_RTC__)
 
-__DEVICE__ float frexp(float __arg, int *__exp) {
+__DEVICE__ __CONSTEXPR__ float frexp(float __arg, int *__exp) {
   return ::frexpf(__arg, __exp);
 }
 
@@ -71,93 +83,101 @@ __DEVICE__ float frexp(float __arg, int *__exp) {
 //        of the variants inside the inner region and avoid the clash.
 #pragma omp begin declare variant match(implementation = {vendor(llvm)})
 
-__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
-__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
-__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
-__DEVICE__ int isfinite(double __x) { return ::__finite(__x); }
-__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
-__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
+__DEVICE__ __CONSTEXPR__ int isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ __CONSTEXPR__ int isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ __CONSTEXPR__ int isfinite(float __x) { return ::__finitef(__x); }
+__DEVICE__ __CONSTEXPR__ int isfinite(double __x) { return ::__finite(__x); }
+__DEVICE__ __CONSTEXPR__ int isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ __CONSTEXPR__ int isnan(double __x) { return ::__isnan(__x); }
 
 #pragma omp end declare variant
 #endif // defined(__OPENMP_AMDGCN__)
 
-__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
-__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
-__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
-__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
-__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
-__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+__DEVICE__ __CONSTEXPR__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ __CONSTEXPR__ bool isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ __CONSTEXPR__ bool isfinite(float __x) { return ::__finitef(__x); }
+__DEVICE__ __CONSTEXPR__ bool isfinite(double __x) { return ::__finite(__x); }
+__DEVICE__ __CONSTEXPR__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ __CONSTEXPR__ bool isnan(double __x) { return ::__isnan(__x); }
 
 #if defined(__OPENMP_AMDGCN__)
 #pragma omp end declare variant
 #endif // defined(__OPENMP_AMDGCN__)
 
-__DEVICE__ bool isgreater(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool isgreater(float __x, float __y) {
   return __builtin_isgreater(__x, __y);
 }
-__DEVICE__ bool isgreater(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool isgreater(double __x, double __y) {
   return __builtin_isgreater(__x, __y);
 }
-__DEVICE__ bool isgreaterequal(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool isgreaterequal(float __x, float __y) {
   return __builtin_isgreaterequal(__x, __y);
 }
-__DEVICE__ bool isgreaterequal(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool isgreaterequal(double __x, double __y) {
   return __builtin_isgreaterequal(__x, __y);
 }
-__DEVICE__ bool isless(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool isless(float __x, float __y) {
   return __builtin_isless(__x, __y);
 }
-__DEVICE__ bool isless(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool isless(double __x, double __y) {
   return __builtin_isless(__x, __y);
 }
-__DEVICE__ bool islessequal(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool islessequal(float __x, float __y) {
   return __builtin_islessequal(__x, __y);
 }
-__DEVICE__ bool islessequal(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool islessequal(double __x, double __y) {
   return __builtin_islessequal(__x, __y);
 }
-__DEVICE__ bool islessgreater(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool islessgreater(float __x, float __y) {
   return __builtin_islessgreater(__x, __y);
 }
-__DEVICE__ bool islessgreater(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool islessgreater(double __x, double __y) {
   return __builtin_islessgreater(__x, __y);
 }
-__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
-__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
-__DEVICE__ bool isunordered(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool isnormal(float __x) {
+  return __builtin_isnormal(__x);
+}
+__DEVICE__ __CONSTEXPR__ bool isnormal(double __x) {
+  return __builtin_isnormal(__x);
+}
+__DEVICE__ __CONSTEXPR__ bool isunordered(float __x, float __y) {
   return __builtin_isunordered(__x, __y);
 }
-__DEVICE__ bool isunordered(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool isunordered(double __x, double __y) {
   return __builtin_isunordered(__x, __y);
 }
-__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
-__DEVICE__ float pow(float __base, int __iexp) {
+__DEVICE__ __CONSTEXPR__ float modf(float __x, float *__iptr) {
+  return ::modff(__x, __iptr);
+}
+__DEVICE__ __CONSTEXPR__ float pow(float __base, int __iexp) {
   return ::powif(__base, __iexp);
 }
-__DEVICE__ double pow(double __base, int __iexp) {
+__DEVICE__ __CONSTEXPR__ double pow(double __base, int __iexp) {
   return ::powi(__base, __iexp);
 }
-__DEVICE__ float remquo(float __x, float __y, int *__quo) {
+__DEVICE__ __CONSTEXPR__ float remquo(float __x, float __y, int *__quo) {
   return ::remquof(__x, __y, __quo);
 }
-__DEVICE__ float scalbln(float __x, long int __n) {
+__DEVICE__ __CONSTEXPR__ float scalbln(float __x, long int __n) {
   return ::scalblnf(__x, __n);
 }
-__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
-__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); }
+__DEVICE__ __CONSTEXPR__ bool signbit(float __x) { return ::__signbitf(__x); }
+__DEVICE__ __CONSTEXPR__ bool signbit(double __x) { return ::__signbit(__x); }
 
 // Notably missing above is nexttoward.  We omit it because
 // ocml doesn't provide an implementation, and we don't want to be in the
 // business of implementing tricky libm functions in this header.
 
 // Other functions.
-__DEVICE__ _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) {
+__DEVICE__ __CONSTEXPR__ _Float16 fma(_Float16 __x, _Float16 __y,
+                                      _Float16 __z) {
   return __ocml_fma_f16(__x, __y, __z);
 }
-__DEVICE__ _Float16 pow(_Float16 __base, int __iexp) {
+__DEVICE__ __CONSTEXPR__ _Float16 pow(_Float16 __base, int __iexp) {
   return __ocml_pown_f16(__base, __iexp);
 }
 
+#ifndef __OPENMP_AMDGCN__
 // BEGIN DEF_FUN and HIP_OVERLOAD
 
 // BEGIN DEF_FUN
@@ -168,18 +188,19 @@ __DEVICE__ _Float16 pow(_Float16 __base, int __iexp) {
 
 // Define cmath functions with float argument and returns __retty.
 #define __DEF_FUN1(__retty, __func)                                            \
-  __DEVICE__                                                                   \
-  __retty __func(float __x) { return __func##f(__x); }
+  __DEVICE__ __CONSTEXPR__ __retty __func(float __x) { return __func##f(__x); }
 
 // Define cmath functions with two float arguments and returns __retty.
 #define __DEF_FUN2(__retty, __func)                                            \
-  __DEVICE__                                                                   \
-  __retty __func(float __x, float __y) { return __func##f(__x, __y); }
+  __DEVICE__ __CONSTEXPR__ __retty __func(float __x, float __y) {              \
+    return __func##f(__x, __y);                                                \
+  }
 
 // Define cmath functions with a float and an int argument and returns __retty.
 #define __DEF_FUN2_FI(__retty, __func)                                         \
-  __DEVICE__                                                                   \
-  __retty __func(float __x, int __y) { return __func##f(__x, __y); }
+  __DEVICE__ __CONSTEXPR__ __retty __func(float __x, int __y) {                \
+    return __func##f(__x, __y);                                                \
+  }
 
 __DEF_FUN1(float, acos)
 __DEF_FUN1(float, acosh)
@@ -426,7 +447,7 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
 // floor(double).
 #define __HIP_OVERLOAD1(__retty, __fn)                                         \
   template <typename __T>                                                      \
-  __DEVICE__                                                                   \
+  __DEVICE__ __CONSTEXPR__                                                     \
       typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type  \
       __fn(__T __x) {                                                          \
     return ::__fn((double)__x);                                                \
@@ -438,7 +459,7 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
 #if __cplusplus >= 201103L
 #define __HIP_OVERLOAD2(__retty, __fn)                                         \
   template <typename __T1, typename __T2>                                      \
-  __DEVICE__ typename __hip_enable_if<                                         \
+  __DEVICE__ __CONSTEXPR__ typename __hip_enable_if<                           \
       __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value,  \
       typename __hip::__promote<__T1, __T2>::type>::type                       \
   __fn(__T1 __x, __T2 __y) {                                                   \
@@ -448,10 +469,11 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
 #else
 #define __HIP_OVERLOAD2(__retty, __fn)                                         \
   template <typename __T1, typename __T2>                                      \
-  __DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&     \
-                                          __hip::is_arithmetic<__T2>::value,   \
-                                      __retty>::type                           \
-  __fn(__T1 __x, __T2 __y) {                                                   \
+  __DEVICE__ __CONSTEXPR__                                                     \
+      typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&            \
+                                   __hip::is_arithmetic<__T2>::value,          \
+                               __retty>::type                                  \
+      __fn(__T1 __x, __T2 __y) {                                               \
     return __fn((double)__x, (double)__y);                                     \
   }
 #endif
@@ -526,7 +548,7 @@ __HIP_OVERLOAD2(double, min)
 // Additional Overloads that don't quite match HIP_OVERLOAD.
 #if __cplusplus >= 201103L
 template <typename __T1, typename __T2, typename __T3>
-__DEVICE__ typename __hip_enable_if<
+__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<
     __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
         __hip::is_arithmetic<__T3>::value,
     typename __hip::__promote<__T1, __T2, __T3>::type>::type
@@ -536,31 +558,32 @@ fma(__T1 __x, __T2 __y, __T3 __z) {
 }
 #else
 template <typename __T1, typename __T2, typename __T3>
-__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
-                                        __hip::is_arithmetic<__T2>::value &&
-                                        __hip::is_arithmetic<__T3>::value,
-                                    double>::type
-fma(__T1 __x, __T2 __y, __T3 __z) {
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
+                                 __hip::is_arithmetic<__T2>::value &&
+                                 __hip::is_arithmetic<__T3>::value,
+                             double>::type
+    fma(__T1 __x, __T2 __y, __T3 __z) {
   return ::fma((double)__x, (double)__y, (double)__z);
 }
 #endif
 
 template <typename __T>
-__DEVICE__
+__DEVICE__ __CONSTEXPR__
     typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
     frexp(__T __x, int *__exp) {
   return ::frexp((double)__x, __exp);
 }
 
 template <typename __T>
-__DEVICE__
+__DEVICE__ __CONSTEXPR__
     typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
     ldexp(__T __x, int __exp) {
   return ::ldexp((double)__x, __exp);
 }
 
 template <typename __T>
-__DEVICE__
+__DEVICE__ __CONSTEXPR__
     typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
     modf(__T __x, double *__exp) {
   return ::modf((double)__x, __exp);
@@ -568,7 +591,7 @@ __DEVICE__
 
 #if __cplusplus >= 201103L
 template <typename __T1, typename __T2>
-__DEVICE__
+__DEVICE__ __CONSTEXPR__
     typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
                                  __hip::is_arithmetic<__T2>::value,
                              typename __hip::__promote<__T1, __T2>::type>::type
@@ -578,23 +601,24 @@ __DEVICE__
 }
 #else
 template <typename __T1, typename __T2>
-__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
-                                        __hip::is_arithmetic<__T2>::value,
-                                    double>::type
-remquo(__T1 __x, __T2 __y, int *__quo) {
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
+                                 __hip::is_arithmetic<__T2>::value,
+                             double>::type
+    remquo(__T1 __x, __T2 __y, int *__quo) {
   return ::remquo((double)__x, (double)__y, __quo);
 }
 #endif
 
 template <typename __T>
-__DEVICE__
+__DEVICE__ __CONSTEXPR__
     typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
     scalbln(__T __x, long int __exp) {
   return ::scalbln((double)__x, __exp);
 }
 
 template <typename __T>
-__DEVICE__
+__DEVICE__ __CONSTEXPR__
     typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
     scalbn(__T __x, int __exp) {
   return ::scalbn((double)__x, __exp);
@@ -607,8 +631,10 @@ __DEVICE__
 
 // END DEF_FUN and HIP_OVERLOAD
 
+#endif // ifndef __OPENMP_AMDGCN__
 #endif // defined(__cplusplus)
 
+#ifndef __OPENMP_AMDGCN__
 // Define these overloads inside the namespace our standard library uses.
 #if !defined(__HIPCC_RTC__)
 #ifdef _LIBCPP_BEGIN_NAMESPACE_STD
@@ -781,22 +807,26 @@ _GLIBCXX_END_NAMESPACE_VERSION
 #if defined(__cplusplus)
 extern "C" {
 #endif // defined(__cplusplus)
-__DEVICE__ __attribute__((overloadable)) double _Cosh(double x, double y) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Cosh(double x,
+                                                                    double y) {
   return cosh(x) * y;
 }
-__DEVICE__ __attribute__((overloadable)) float _FCosh(float x, float y) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FCosh(float x,
+                                                                    float y) {
   return coshf(x) * y;
 }
-__DEVICE__ __attribute__((overloadable)) short _Dtest(double *p) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _Dtest(double *p) {
   return fpclassify(*p);
 }
-__DEVICE__ __attribute__((overloadable)) short _FDtest(float *p) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _FDtest(float *p) {
   return fpclassify(*p);
 }
-__DEVICE__ __attribute__((overloadable)) double _Sinh(double x, double y) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Sinh(double x,
+                                                                    double y) {
   return sinh(x) * y;
 }
-__DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FSinh(float x,
+                                                                    float y) {
   return sinhf(x) * y;
 }
 #if defined(__cplusplus)
@@ -804,7 +834,9 @@ __DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) {
 #endif // defined(__cplusplus)
 #endif // defined(_MSC_VER)
 #endif // !defined(__HIPCC_RTC__)
+#endif // ifndef __OPENMP_AMDGCN__
 
 #pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__CONSTEXPR__")
 
 #endif // __CLANG_HIP_CMATH_H__
diff --git a/lib/include/__clang_hip_math.h b/lib/include/__clang_hip_math.h
index 1f0982d92e..ef7e087b83 100644
--- a/lib/include/__clang_hip_math.h
+++ b/lib/include/__clang_hip_math.h
@@ -9,7 +9,7 @@
 #ifndef __CLANG_HIP_MATH_H__
 #define __CLANG_HIP_MATH_H__
 
-#if !defined(__HIP__)
+#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
 #error "This file is for HIP and OpenMP AMDGCN device compilation only."
 #endif
 
@@ -19,18 +19,30 @@
 #endif
 #include <limits.h>
 #include <stdint.h>
-#endif // __HIPCC_RTC__
+#ifdef __OPENMP_AMDGCN__
+#include <omp.h>
+#endif
+#endif // !defined(__HIPCC_RTC__)
 
 #pragma push_macro("__DEVICE__")
+
+#ifdef __OPENMP_AMDGCN__
+#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
+#else
 #define __DEVICE__ static __device__ inline __attribute__((always_inline))
+#endif
 
 // A few functions return bool type starting only in C++11.
 #pragma push_macro("__RETURN_TYPE")
+#ifdef __OPENMP_AMDGCN__
+#define __RETURN_TYPE int
+#else
 #if defined(__cplusplus)
 #define __RETURN_TYPE bool
 #else
 #define __RETURN_TYPE int
 #endif
+#endif // __OPENMP_AMDGCN__
 
 #if defined (__cplusplus) && __cplusplus < 201103L
 // emulate static_assert on type sizes
@@ -249,6 +261,9 @@ float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
 __DEVICE__
 float frexpf(float __x, int *__nptr) {
   int __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   float __r =
       __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
   *__nptr = __tmp;
@@ -334,6 +349,9 @@ long int lroundf(float __x) { return __ocml_round_f32(__x); }
 __DEVICE__
 float modff(float __x, float *__iptr) {
   float __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   float __r =
       __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
   *__iptr = __tmp;
@@ -414,6 +432,9 @@ float remainderf(float __x, float __y) {
 __DEVICE__
 float remquof(float __x, float __y, int *__quo) {
   int __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   float __r = __ocml_remquo_f32(
       __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
   *__quo = __tmp;
@@ -470,6 +491,9 @@ __RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); }
 __DEVICE__
 void sincosf(float __x, float *__sinptr, float *__cosptr) {
   float __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   *__sinptr =
       __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
   *__cosptr = __tmp;
@@ -478,6 +502,9 @@ void sincosf(float __x, float *__sinptr, float *__cosptr) {
 __DEVICE__
 void sincospif(float __x, float *__sinptr, float *__cosptr) {
   float __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   *__sinptr = __ocml_sincospi_f32(
       __x, (__attribute__((address_space(5))) float *)&__tmp);
   *__cosptr = __tmp;
@@ -790,6 +817,9 @@ double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
 __DEVICE__
 double frexp(double __x, int *__nptr) {
   int __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   double __r =
       __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
   *__nptr = __tmp;
@@ -874,6 +904,9 @@ long int lround(double __x) { return __ocml_round_f64(__x); }
 __DEVICE__
 double modf(double __x, double *__iptr) {
   double __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   double __r =
       __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
   *__iptr = __tmp;
@@ -962,6 +995,9 @@ double remainder(double __x, double __y) {
 __DEVICE__
 double remquo(double __x, double __y, int *__quo) {
   int __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   double __r = __ocml_remquo_f64(
       __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
   *__quo = __tmp;
@@ -1020,6 +1056,9 @@ double sin(double __x) { return __ocml_sin_f64(__x); }
 __DEVICE__
 void sincos(double __x, double *__sinptr, double *__cosptr) {
   double __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   *__sinptr = __ocml_sincos_f64(
       __x, (__attribute__((address_space(5))) double *)&__tmp);
   *__cosptr = __tmp;
@@ -1028,6 +1067,9 @@ void sincos(double __x, double *__sinptr, double *__cosptr) {
 __DEVICE__
 void sincospi(double __x, double *__sinptr, double *__cosptr) {
   double __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
   *__sinptr = __ocml_sincospi_f64(
       __x, (__attribute__((address_space(5))) double *)&__tmp);
   *__cosptr = __tmp;
@@ -1262,7 +1304,7 @@ float min(float __x, float __y) { return fminf(__x, __y); }
 __DEVICE__
 double min(double __x, double __y) { return fmin(__x, __y); }
 
-#if !defined(__HIPCC_RTC__)
+#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 __host__ inline static int min(int __arg1, int __arg2) {
   return std::min(__arg1, __arg2);
 }
@@ -1270,7 +1312,7 @@ __host__ inline static int min(int __arg1, int __arg2) {
 __host__ inline static int max(int __arg1, int __arg2) {
   return std::max(__arg1, __arg2);
 }
-#endif // __HIPCC_RTC__
+#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 #endif
 
 #pragma pop_macro("__DEVICE__")
diff --git a/lib/include/intrin.h b/lib/include/intrin.h
index ff8eb8fca2..34ec79d6ac 100644
--- a/lib/include/intrin.h
+++ b/lib/include/intrin.h
@@ -574,6 +574,9 @@ void _WriteStatusReg(int, __int64);
 unsigned short __cdecl _byteswap_ushort(unsigned short val);
 unsigned long __cdecl _byteswap_ulong (unsigned long val);
 unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val);
+
+__int64 __mulh(__int64 __a, __int64 __b);
+unsigned __int64 __umulh(unsigned __int64 __a, unsigned __int64 __b);
 #endif
 
 /*----------------------------------------------------------------------------*\
diff --git a/lib/include/openmp_wrappers/__clang_openmp_device_functions.h b/lib/include/openmp_wrappers/__clang_openmp_device_functions.h
index 953857badf..279fb26fba 100644
--- a/lib/include/openmp_wrappers/__clang_openmp_device_functions.h
+++ b/lib/include/openmp_wrappers/__clang_openmp_device_functions.h
@@ -14,13 +14,13 @@
 #error "This file is for OpenMP compilation only."
 #endif
 
-#pragma omp begin declare variant match(                                       \
-    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
 #define __CUDA__
 #define __OPENMP_NVPTX__
 
@@ -33,11 +33,33 @@ extern "C" {
 #undef __OPENMP_NVPTX__
 #undef __CUDA__
 
-#ifdef __cplusplus
-} // extern "C"
+#pragma omp end declare variant
+
+#ifdef __AMDGCN__
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Import types which will be used by __clang_hip_libdevice_declares.h
+#ifndef __cplusplus
+#include <stdbool.h>
+#include <stdint.h>
 #endif
 
+#define __OPENMP_AMDGCN__
+#pragma push_macro("__device__")
+#define __device__
+
+/// Include declarations for libdevice functions.
+#include <__clang_hip_libdevice_declares.h>
+
+#pragma pop_macro("__device__")
+#undef __OPENMP_AMDGCN__
+
 #pragma omp end declare variant
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
 
 // Ensure we make `_ZdlPv`, aka. `operator delete(void*)` available without the
 // need to `include <new>` in C++ mode.
diff --git a/lib/include/openmp_wrappers/cmath b/lib/include/openmp_wrappers/cmath
index 1aff66af7d..22a720aca9 100644
--- a/lib/include/openmp_wrappers/cmath
+++ b/lib/include/openmp_wrappers/cmath
@@ -75,4 +75,58 @@ __DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); }
 
 #pragma omp end declare variant
 
+#ifdef __AMDGCN__
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+#pragma push_macro("__constant__")
+#define __constant__ __attribute__((constant))
+#define __OPENMP_AMDGCN__
+
+#include <__clang_hip_cmath.h>
+
+#pragma pop_macro("__constant__")
+#undef __OPENMP_AMDGCN__
+
+// Define overloads otherwise which are absent
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+
+__DEVICE__ float acos(float __x) { return ::acosf(__x); }
+__DEVICE__ float acosh(float __x) { return ::acoshf(__x); }
+__DEVICE__ float asin(float __x) { return ::asinf(__x); }
+__DEVICE__ float asinh(float __x) { return ::asinhf(__x); }
+__DEVICE__ float atan(float __x) { return ::atanf(__x); }
+__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
+__DEVICE__ float atanh(float __x) { return ::atanhf(__x); }
+__DEVICE__ float cbrt(float __x) { return ::cbrtf(__x); }
+__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
+__DEVICE__ float erf(float __x) { return ::erff(__x); }
+__DEVICE__ float erfc(float __x) { return ::erfcf(__x); }
+__DEVICE__ float exp2(float __x) { return ::exp2f(__x); }
+__DEVICE__ float expm1(float __x) { return ::expm1f(__x); }
+__DEVICE__ float fdim(float __x, float __y) { return ::fdimf(__x, __y); }
+__DEVICE__ float hypot(float __x, float __y) { return ::hypotf(__x, __y); }
+__DEVICE__ int ilogb(float __x) { return ::ilogbf(__x); }
+__DEVICE__ float ldexp(float __arg, int __exp) {
+  return ::ldexpf(__arg, __exp);
+}
+__DEVICE__ float lgamma(float __x) { return ::lgammaf(__x); }
+__DEVICE__ float log1p(float __x) { return ::log1pf(__x); }
+__DEVICE__ float logb(float __x) { return ::logbf(__x); }
+__DEVICE__ float nextafter(float __x, float __y) {
+  return ::nextafterf(__x, __y);
+}
+__DEVICE__ float remainder(float __x, float __y) {
+  return ::remainderf(__x, __y);
+}
+__DEVICE__ float scalbn(float __x, int __y) { return ::scalbnf(__x, __y); }
+__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
+__DEVICE__ float tan(float __x) { return ::tanf(__x); }
+__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+__DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); }
+
+#undef __DEVICE__
+
+#pragma omp end declare variant
+#endif // __AMDGCN__
+
 #endif
diff --git a/lib/include/openmp_wrappers/math.h b/lib/include/openmp_wrappers/math.h
index c64af8b13e..1e3c07cfdb 100644
--- a/lib/include/openmp_wrappers/math.h
+++ b/lib/include/openmp_wrappers/math.h
@@ -48,4 +48,14 @@
 
 #pragma omp end declare variant
 
+#ifdef __AMDGCN__
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+#define __OPENMP_AMDGCN__
+#include <__clang_hip_math.h>
+#undef __OPENMP_AMDGCN__
+
+#pragma omp end declare variant
+#endif
+
 #endif
diff --git a/lib/libcxx/include/cwctype b/lib/libcxx/include/cwctype
index 17c68d6d45..27eea2f157 100644
--- a/lib/libcxx/include/cwctype
+++ b/lib/libcxx/include/cwctype
@@ -59,6 +59,7 @@ wctrans_t wctrans(const char* property);
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+#if defined(_LIBCPP_INCLUDED_C_LIBRARY_WCTYPE_H)
 using ::wint_t _LIBCPP_USING_IF_EXISTS;
 using ::wctrans_t _LIBCPP_USING_IF_EXISTS;
 using ::wctype_t _LIBCPP_USING_IF_EXISTS;
@@ -80,6 +81,7 @@ using ::towlower _LIBCPP_USING_IF_EXISTS;
 using ::towupper _LIBCPP_USING_IF_EXISTS;
 using ::towctrans _LIBCPP_USING_IF_EXISTS;
 using ::wctrans _LIBCPP_USING_IF_EXISTS;
+#endif // _LIBCPP_INCLUDED_C_LIBRARY_WCTYPE_H
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/lib/libcxx/include/string b/lib/libcxx/include/string
index 4940021b0c..4159ea5803 100644
--- a/lib/libcxx/include/string
+++ b/lib/libcxx/include/string
@@ -522,6 +522,7 @@ basic_string<char32_t> operator "" s( const char32_t *str, size_t len ); // C++1
 #include <algorithm>
 #include <compare>
 #include <cstdio>  // EOF
+#include <cstdlib>
 #include <cstring>
 #include <cwchar>
 #include <initializer_list>
@@ -1714,6 +1715,24 @@ private:
         return data() <= __p && __p <= data() + size();
     }
 
+    _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI
+    void __throw_length_error() const {
+#ifndef _LIBCPP_NO_EXCEPTIONS
+        __basic_string_common<true>::__throw_length_error();
+#else
+        _VSTD::abort();
+#endif
+    }
+
+    _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI
+    void __throw_out_of_range() const {
+#ifndef _LIBCPP_NO_EXCEPTIONS
+        __basic_string_common<true>::__throw_out_of_range();
+#else
+        _VSTD::abort();
+#endif
+    }
+
     friend basic_string operator+<>(const basic_string&, const basic_string&);
     friend basic_string operator+<>(const value_type*, const basic_string&);
     friend basic_string operator+<>(value_type, const basic_string&);
diff --git a/lib/libcxx/include/vector b/lib/libcxx/include/vector
index 9189ed44a8..90d8b946f1 100644
--- a/lib/libcxx/include/vector
+++ b/lib/libcxx/include/vector
@@ -281,6 +281,7 @@ erase_if(vector<T, Allocator>& c, Predicate pred);    // C++20
 #include <algorithm>
 #include <climits>
 #include <compare>
+#include <cstdlib>
 #include <cstring>
 #include <initializer_list>
 #include <iosfwd> // for forward declaration of vector
@@ -390,6 +391,25 @@ protected:
             is_nothrow_move_assignable<allocator_type>::value)
         {__move_assign_alloc(__c, integral_constant<bool,
                       __alloc_traits::propagate_on_container_move_assignment::value>());}
+
+    _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI
+    void __throw_length_error() const {
+#ifndef _LIBCPP_NO_EXCEPTIONS
+        __vector_base_common<true>::__throw_length_error();
+#else
+        _VSTD::abort();
+#endif
+    }
+
+    _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI
+    void __throw_out_of_range() const {
+#ifndef _LIBCPP_NO_EXCEPTIONS
+        __vector_base_common<true>::__throw_out_of_range();
+#else
+        _VSTD::abort();
+#endif
+    }
+
 private:
     _LIBCPP_INLINE_VISIBILITY
     void __copy_assign_alloc(const __vector_base& __c, true_type)
diff --git a/lib/libcxx/include/wctype.h b/lib/libcxx/include/wctype.h
index 1b4b146149..3b614759ac 100644
--- a/lib/libcxx/include/wctype.h
+++ b/lib/libcxx/include/wctype.h
@@ -50,8 +50,18 @@ wctrans_t wctrans(const char* property);
 #pragma GCC system_header
 #endif
 
+// TODO:
+// In the future, we should unconditionally include_next <wctype.h> here and instead
+// have a mode under which the library does not need libc++'s <wctype.h> or <cwctype>
+// at all (i.e. a mode without wchar_t). As it stands, we need to do that to completely
+// bypass the using declarations in <cwctype> when we did not include <wctype.h>.
+// Otherwise, a using declaration like `using ::wint_t` in <cwctype> will refer to
+// nothing (with using_if_exists), and if we include another header that defines one
+// of these declarations (e.g. <wchar.h>), the second `using ::wint_t` with using_if_exists
+// will fail because it does not refer to the same declaration.
 #if __has_include_next(<wctype.h>)
 #   include_next <wctype.h>
+#   define _LIBCPP_INCLUDED_C_LIBRARY_WCTYPE_H
 #endif
 
 #ifdef __cplusplus
diff --git a/lib/libcxxabi/src/cxa_personality.cpp b/lib/libcxxabi/src/cxa_personality.cpp
index a4f81d7473..91b584eb8c 100644
--- a/lib/libcxxabi/src/cxa_personality.cpp
+++ b/lib/libcxxabi/src/cxa_personality.cpp
@@ -702,10 +702,10 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
                 return;
             }
             landingPad = (uintptr_t)lpStart + landingPad;
-            results.landingPad = landingPad;
 #else  // __USING_SJLJ_EXCEPTIONS__
             ++landingPad;
 #endif // __USING_SJLJ_EXCEPTIONS__
+            results.landingPad = landingPad;
             if (actionEntry == 0)
             {
                 // Found a cleanup
-- 
cgit v1.2.3