PerformanceTests/StitchMarker/ogiroux/lib/semaphore.cpp - WebKit - Git at Google

 /*

 Copyright (c) 2017, NVIDIA Corporation
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice, this
 list of conditions and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright notice,
 this list of conditions and the following disclaimer in the documentation
 and/or other materials provided with the distribution.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 OF THE POSSIBILITY OF SUCH DAMAGE.

 */
 #ifdef __semaphore_cuda
 namespace cuda
 {
 #else
 #include "semaphore"
 namespace std
 {
 #endif //__semaphore_cuda

 namespace experimental
 {
 inline namespace v1 {
 namespace details
 {
 #ifdef __semaphore_cuda

 #else

 #ifdef __linux__
 // On Linux, we make use of the kernel memory wait operations. These have been available for a long time.
 // Returns the address of `a` rounded down to a multiple of sizeof(int).
 // Objects larger than an int are rejected at compile time.
 template <class A>
 inline const void *__semaphore_fixalign(A &a)
 {
     static_assert(sizeof(A) <= sizeof(int), "Linux only supports 'int' for Futex.");
     auto const address = (intptr_t)&a;
     auto const low_bits = sizeof(int) - 1;
     return (const void *)(address & ~low_bits);
 }
 // Copies sizeof(int) bytes starting at `p` into an int and returns it.
 inline int __semaphore_readint(const void *p)
 {
     int word;
     memcpy(&word, p, sizeof word);
     return word;
 }
 // Issues a FUTEX_WAIT_PRIVATE on the int-aligned word containing `a`, but
 // only if `a` still holds `v`.
 //
 // The aligned word is sampled first and `a` is re-checked afterwards; the
 // sampled word is what gets handed to the futex call as the expected value.
 // Returns without making the syscall when `a != v`.
 //
 // The unused timeout / uaddr2 slots are passed as nullptr rather than a bare
 // `0`: syscall() is variadic, so an int 0 is not guaranteed to arrive as a
 // full-width null pointer.
 template <class A, class V>
 inline void __semaphore_wait(A &a, V v)
 {
     auto p = __semaphore_fixalign(a);
     auto i = __semaphore_readint(p);
     // Compiler barrier: keep the word sample above ahead of the load below.
     asm volatile("" ::: "memory");
     if (a.load(memory_order_relaxed) != v)
         return;
     syscall(SYS_futex, p, FUTEX_WAIT_PRIVATE, i, nullptr, nullptr, 0);
 }
 // Timed counterpart of __semaphore_wait: issues a FUTEX_WAIT_PRIVATE on the
 // int-aligned word containing `a`, with `t` converted by
 // __semaphore_to_timespec, but only if `a` still holds `v`.
 // NOTE(review): the futex timeout slot expects a pointer to a timespec;
 // confirm what __semaphore_to_timespec returns.
 template <class A, class V, class Rep, class Period>
 void __semaphore_wait_timed(A &a, V v, const chrono::duration<Rep, Period> &t)
 {
     auto const word_address = __semaphore_fixalign(a);
     auto const sampled_word = __semaphore_readint(word_address);
     // Compiler barrier between sampling the word and re-checking `a`.
     asm volatile("" ::: "memory");
     if (a.load(memory_order_relaxed) != v)
         return;
     syscall(SYS_futex, word_address, FUTEX_WAIT_PRIVATE, sampled_word, __semaphore_to_timespec(t), 0, 0);
 }
 // Issues FUTEX_WAKE_PRIVATE with a wake count of 1 on the int-aligned word
 // containing `a`.
 template <class A>
 inline void __semaphore_wake_one(A &a)
 {
     auto const word_address = __semaphore_fixalign(a);
     syscall(SYS_futex, word_address, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
 }
 // Issues FUTEX_WAKE_PRIVATE with a wake count of INT_MAX on the int-aligned
 // word containing `a`.
 template <class A>
 inline void __semaphore_wake_all(A &a)
 {
     auto const word_address = __semaphore_fixalign(a);
     syscall(SYS_futex, word_address, FUTEX_WAKE_PRIVATE, INT_MAX, 0, 0, 0);
 }
 // Overload for volatile atomics: same protocol as the non-volatile
 // __semaphore_wait, but uses FUTEX_WAIT instead of FUTEX_WAIT_PRIVATE.
 //
 // The aligned word is sampled first and `a` is re-checked afterwards; the
 // sampled word is handed to the futex call as the expected value. Returns
 // without making the syscall when `a != v`.
 //
 // The unused timeout / uaddr2 slots are passed as nullptr rather than a bare
 // `0`: syscall() is variadic, so an int 0 is not guaranteed to arrive as a
 // full-width null pointer.
 template <class A, class V>
 inline void __semaphore_wait(volatile A &a, V v)
 {
     auto p = __semaphore_fixalign(a);
     auto i = __semaphore_readint(p);
     // Compiler barrier: keep the word sample above ahead of the load below.
     asm volatile("" ::: "memory");
     if (a.load(memory_order_relaxed) != v)
         return;
     syscall(SYS_futex, p, FUTEX_WAIT, i, nullptr, nullptr, 0);
 }
 // Timed wait for volatile atomics: issues FUTEX_WAIT (not the _PRIVATE form)
 // on the int-aligned word containing `a`, with `t` converted by
 // details::__semaphore_to_timespec, but only if `a` still holds `v`.
 // NOTE(review): the futex timeout slot expects a pointer to a timespec;
 // confirm what __semaphore_to_timespec returns.
 template <class A, class V, class Rep, class Period>
 void __semaphore_wait_timed(volatile A &a, V v, const chrono::duration<Rep, Period> &t)
 {
     auto const word_address = __semaphore_fixalign(a);
     auto const sampled_word = __semaphore_readint(word_address);
     // Compiler barrier between sampling the word and re-checking `a`.
     asm volatile("" ::: "memory");
     if (a.load(memory_order_relaxed) != v)
         return;
     syscall(SYS_futex, word_address, FUTEX_WAIT, sampled_word, details::__semaphore_to_timespec(t), 0, 0);
 }
 // Volatile overload: issues FUTEX_WAKE with a wake count of 1 on the
 // int-aligned word containing `a`.
 template <class A>
 inline void __semaphore_wake_one(volatile A &a)
 {
     auto const word_address = __semaphore_fixalign(a);
     syscall(SYS_futex, word_address, FUTEX_WAKE, 1, 0, 0, 0);
 }
 // Volatile overload: issues FUTEX_WAKE with a wake count of INT_MAX on the
 // int-aligned word containing `a`.
 template <class A>
 inline void __semaphore_wake_all(volatile A &a)
 {
     auto const word_address = __semaphore_fixalign(a);
     syscall(SYS_futex, word_address, FUTEX_WAKE, INT_MAX, 0, 0, 0);
 }
 #endif // __linux__

 #if defined(WIN32) && _WIN32_WINNT >= 0x0602
 // On Windows, we make use of the kernel memory wait operations as well. These first became available with Windows 8.
 template <class A, class V>
 void __semaphore_wait(A &a, V v)
 {
     static_assert(sizeof(V) <= 8, "Windows only allows sizes between 1B and 8B for WaitOnAddress.");
     WaitOnAddress((PVOID)&a, (PVOID)&v, sizeof(v), -1);
 }
 // Calls WaitOnAddress on `a`, comparing against a local copy of `v`, with
 // `delta` converted to whole milliseconds as the timeout.
 template <class A, class V, class Rep, class Period>
 void __semaphore_wait_timed(A &a, V v, chrono::duration<Rep, Period> const &delta)
 {
     static_assert(sizeof(V) <= 8, "Windows only allows sizes between 1B and 8B for WaitOnAddress.");
     auto const timeout_ms = (DWORD)chrono::duration_cast<chrono::milliseconds>(delta).count();
     WaitOnAddress((PVOID)&a, (PVOID)&v, sizeof(v), timeout_ms);
 }
 template <class A>
 inline void __semaphore_wake_one(A &a)
 {
     WakeByAddressSingle((PVOID)&a);
 }
 template <class A>
 inline void __semaphore_wake_all(A &a)
 {
     WakeByAddressAll((PVOID)&a);
 }
 #endif // defined(WIN32) && _WIN32_WINNT >= 0x0602

 #endif // __semaphore_cuda

 // Slow path shared by the blocking and timed binary_semaphore acquires.
 //
 // Takes a ticket from `ticket`, then loops until `tocket` equals that ticket
 // and a compare-exchange manages to set __valubit in `atom`.
 //   atom   - the semaphore state word (value / lock / slow / contention bits).
 //   ticket - counter this call increments to obtain its ticket.
 //   tocket - counter compared against the ticket to decide whose turn it is.
 //   (bool const&) - unnamed and unused in this function.
 //   fn     - callback invoked with an observed value of `atom`; when it
 //            returns false the acquire is abandoned.
 // Returns true once the compare-exchange succeeds, false if `fn` returned false.
 // NOTE(review): nothing in this function advances `tocket`; presumably the
 // release path does - confirm.
 template<class Fn>
 __semaphore_abi bool __binary_semaphore_acquire_slow(
     atomic<binary_semaphore::count_type>& atom, atomic<binary_semaphore::count_type>& ticket,
     atomic<binary_semaphore::count_type>& tocket, bool const&, Fn fn) noexcept
 {
     uint32_t const tick = ticket.fetch_add(1, std::memory_order_relaxed);
     uint32_t tock = tocket.load(std::memory_order_relaxed);
     // Non-zero once this call has added __contbit to `atom` (see loop tail).
     uint32_t contbit = 0u;
 #ifdef __semaphore_fast_path
     // Running total of the sleep durations requested so far, in nanoseconds.
     uint32_t sum = 0u;
     while(1) {
         // Sleep-based backoff until 64*1024 ns have been requested in total;
         // after that, switch to the fn-based wait in the else branch.
         if(sum < 64*1024) {
 #else
     while(1) {
 #endif
             // Sleep length scales with the distance between our ticket and the
             // last observed `tocket` value.
             uint32_t const delta = (tick - tock) * 128;
 #if !defined(__CUDA_ARCH__)
             std::this_thread::sleep_for(std::chrono::nanoseconds(delta));
 #elif defined(__has_cuda_nanosleep)
             details::__mme_nanosleep(delta);
 #endif
 #ifdef __semaphore_fast_path
             sum += delta;
         }
         else
         {
             // Set __slowbit in `atom`; `old` is the resulting value.
             uint32_t old = atom.fetch_or(binary_semaphore::__slowbit, std::memory_order_relaxed) | binary_semaphore::__slowbit;
             // Only hand off to fn while the value bit is set.
             if ((old & binary_semaphore::__valubit) != 0) {
                 atomic_thread_fence(std::memory_order_seq_cst);
                 if(!fn(old))
                     return false;
             }
         }
         uint32_t old = atom.load(std::memory_order_relaxed);
 #else
         // Without the fast path, fn is consulted on every iteration.
         uint32_t old = atom.load(std::memory_order_relaxed);
         if(!fn(old))
             return false;
 #endif
         // Not our turn until `tocket` matches the ticket we drew.
         tock = tocket.load(std::memory_order_relaxed);
         if(tock != tick)
             continue;
         // Value bit clear: try to set it, removing any contention bit we added.
         while ((old & binary_semaphore::__valubit) == 0) {
             // The expected value never includes the lock bit.
             old &= ~binary_semaphore::__lockbit;
             uint32_t next = old - contbit + binary_semaphore::__valubit;
             if (atom.compare_exchange_weak(old, next, std::memory_order_acquire, std::memory_order_relaxed))
                 return true;
         }
         // Value bit is set: add __contbit to `atom` once.
         if(contbit == 0)
             atom.fetch_add(contbit = binary_semaphore::__contbit, std::memory_order_relaxed);
     }
 }

 }

 #ifdef __semaphore_fast_path
 // Slow-path release: clears __valubit and __slowbit in __atom with a
 // release compare-exchange. If __slowbit was set in the value that was
 // replaced, __lockbit is set by the same exchange, all waiters on __atom are
 // woken, and __lockbit is cleared again afterwards.
 //   old - the caller's last observed value of __atom.
 __semaphore_abi void binary_semaphore::__release_slow(count_type old) noexcept
 {
     count_type wake_lock = 0;
     for (;;) {
         // The expected value never includes the lock bit.
         old &= ~__lockbit;
         wake_lock = (old & __slowbit) ? __lockbit : 0;
         if (__atom.compare_exchange_weak(old, (old | wake_lock) & ~(__valubit | __slowbit), std::memory_order_release, std::memory_order_relaxed))
             break;
     }
     if (wake_lock == 0)
         return;
     atomic_thread_fence(std::memory_order_seq_cst);
     details::__semaphore_wake_all(__atom);
     __atom.fetch_and(~__lockbit, std::memory_order_release);
 }
 #endif

 // Untimed slow-path acquire: delegates to the shared
 // __binary_semaphore_acquire_slow with a callback that never gives up.
 // With __semaphore_fast_path the callback waits on __atom via
 // details::__semaphore_wait; otherwise it does nothing.
 __semaphore_abi void binary_semaphore::__acquire_slow() noexcept
 {
     auto const wait_for_change = [=] __semaphore_abi (uint32_t observed) -> bool {
 #ifdef __semaphore_fast_path
         details::__semaphore_wait(__atom, observed);
 #else
         (void)observed;
 #endif
         // Always keep trying.
         return true;
     };
     details::__binary_semaphore_acquire_slow(__atom, __ticket, __tocket, __stolen, wait_for_change);
 }

 #ifndef __semaphore_cuda

 // Timed slow-path acquire: delegates to the shared
 // __binary_semaphore_acquire_slow with a callback that reports whether
 // `abs_time` is still in the future. With __semaphore_fast_path the callback
 // first waits on __atom for the remaining time, if any remains.
 // Returns whatever the shared slow path returns.
 __semaphore_abi bool binary_semaphore::__acquire_slow_timed(std::chrono::time_point<details::__semaphore_clock, details::__semaphore_duration> const& abs_time) noexcept
 {
     auto const wait_until_deadline = [=](uint32_t observed) __semaphore_abi -> bool {
 #ifdef __semaphore_fast_path
         auto remaining = abs_time - details::__semaphore_clock::now();
         if(remaining > std::chrono::microseconds(0))
             details::__semaphore_wait_timed(__atom, observed, remaining);
 #else
         (void)observed;
 #endif
         // false once the deadline has been reached.
         return details::__semaphore_clock::now() < abs_time;
     };
     return details::__binary_semaphore_acquire_slow(__atom, __ticket, __tocket, __stolen, wait_until_deadline);
 }

 #endif

 #ifndef __semaphore_sem

 // Tries to subtract one unit (1 << __shift) from `atom` with a
 // compare-exchange, retrying while the count field of the freshly observed
 // value is at least 1.
 //   old   - the caller's last observed value of `atom`.
 //   order - memory order used when the exchange succeeds.
 // Returns true if the subtraction was applied, false once the observed
 // count field drops below 1.
 __semaphore_abi bool counting_semaphore::__fetch_sub_if_slow(counting_semaphore::count_type old, std::memory_order order) noexcept
 {
     while (true)
     {
         // The expected value never includes the lock bits.
         old &= ~__lockmask;
         if (atom.compare_exchange_weak(old, old - (1 << __shift), order, std::memory_order_relaxed))
             return true;
         // A failed exchange refreshed `old`; stop when nothing is left to take.
         if ((old >> __shift) < 1)
             return false;
     }
 }

 #ifdef __semaphore_fast_path
 // Slow-path release: adds `term` units to the count held in `atom` with a
 // compare-exchange loop. When the replaced value had contention bits set and
 // `notify` is not `none`, the lock bits are set by the same exchange, the
 // requested wake-up is issued, and the lock bits are cleared afterwards.
 //   term   - number of units to add (shifted by __shift before adding).
 //   old    - the caller's last observed value of `atom`.
 //   order  - memory order used when the exchange succeeds.
 //   notify - selects wake-all, wake-one, or no wake-up.
 // NOTE(review): the enumerators are spelled both `semaphore_notify::none`
 // and `semaphore_notify_none` in this function; confirm both are declared.
 void counting_semaphore::__fetch_add_slow(counting_semaphore::count_type term, counting_semaphore::count_type old, std::memory_order order, semaphore_notify notify) noexcept
 {
     bool apply_lock = false;
     for (;;)
     {
         apply_lock = ((old & __contmask) != 0) && (notify != semaphore_notify::none);
         // The new value is derived from `old` before the lock bits are masked out.
         int const desired = ((old & __valumask) + (term << __shift)) | (apply_lock ? __lockmask : 0);

         // The expected value never includes the lock bits.
         old &= ~__lockmask;
         if (atom.compare_exchange_weak(old, desired, order, std::memory_order_relaxed))
             break;
     }
     if (!apply_lock)
         return;
     switch (notify)
     {
     case semaphore_notify_all:
         details::__semaphore_wake_all(atom);
         break;
     case semaphore_notify_one:
         details::__semaphore_wake_one(atom);
         break;
     case semaphore_notify_none:
         break;
     }
     atom.fetch_and(~__lockmask, std::memory_order_relaxed);
 }
 #endif

 // Slow-path wait: returns once the count field of `atom` (the bits above
 // __shift) has been observed to be at least 1. This function does not itself
 // modify the count.
 //   order - memory order used for the polling loads of `atom`.
 // Without __semaphore_fast_path it polls with backoff forever. With
 // __semaphore_fast_path it polls twice, then marks contention in `atom` and
 // blocks in details::__semaphore_wait between checks.
 __semaphore_abi void counting_semaphore::__acquire_slow(std::memory_order order) noexcept
 {

     int old;
     details::__semaphore_exponential_backoff b;
 #ifdef __semaphore_fast_path
     // Fast path: only two backoff-and-poll rounds before blocking.
     for (int i = 0; i < 2; ++i)
     {
 #else
     while (1)
     {
 #endif
         b.sleep();
         old = atom.load(order);
         if ((old >> __shift) >= 1)
             goto done;
     }
 #ifdef __semaphore_fast_path
     while (1)
     {
         // Set the contention bits; `old` is the resulting value.
         old = atom.fetch_or(__contmask, std::memory_order_relaxed) | __contmask;
         if ((old >> __shift) >= 1)
             goto done;
         // Block while `atom` still holds the value just observed.
         details::__semaphore_wait(atom, old);
         old = atom.load(order);
         if ((old >> __shift) >= 1)
             goto done;
     }
 #endif
 done:
 #ifdef __semaphore_fast_path
     // Spin until the lock bits are no longer set in the observed value.
     while (old & __lockmask)
         old = atom.load(std::memory_order_relaxed);
 #else
     ;
 #endif
 }
 #endif

 #ifndef __semaphore_cuda

 // Bytes reserved per table entry: the larger of sizeof(synchronic) and
 // alignof(synchronic).
 static constexpr int __atomic_wait_table_entry_size = sizeof(synchronic) > alignof(synchronic) ? sizeof(synchronic) : alignof(synchronic);

 // Number of entries in the table. __atomic_wait_table_index masks with
 // (count - 1), so this is expected to stay a power of two.
 static constexpr int __atomic_wait_table_entry_count = 1024;

 // Zero-initialized, 64-byte-aligned raw storage; each row is handed out as a
 // `synchronic` by __atomic_wait_get_semaphore.
 __semaphore_managed alignas(64) unsigned char __atomic_wait_table[__atomic_wait_table_entry_count][__atomic_wait_table_entry_size] = { 0 };

 // Maps an address to a row of __atomic_wait_table: the address is divided by
 // the entry size and the result is masked with (entry count - 1).
 __semaphore_abi size_t __atomic_wait_table_index(void const* ptr) {
     auto const address = (uintptr_t)ptr;
     auto const slot = address / __atomic_wait_table_entry_size;
     return slot & (__atomic_wait_table_entry_count - 1);
 }

 // Returns the __atomic_wait_table row selected by __atomic_wait_table_index
 // for address `a`, viewed as a `synchronic`.
 __semaphore_abi synchronic *__atomic_wait_get_semaphore(void const *a)
 {
     auto const slot = __atomic_wait_table_index(a);
     return (synchronic *)&__atomic_wait_table[slot][0];
 }

 #endif
 }
 } // namespace experimental
 } // namespace std
	/*

	Copyright (c) 2017, NVIDIA Corporation
	All rights reserved.

	Redistribution and use in source and binary forms, with or without modification,
	are permitted provided that the following conditions are met:

	1. Redistributions of source code must retain the above copyright notice, this
	list of conditions and the following disclaimer.

	2. Redistributions in binary form must reproduce the above copyright notice,
	this list of conditions and the following disclaimer in the documentation
	and/or other materials provided with the distribution.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
	INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
	LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
	OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
	OF THE POSSIBILITY OF SUCH DAMAGE.

	*/
	#ifdef __semaphore_cuda
	namespace cuda
	{
	#else
	#include "semaphore"
	namespace std
	{
	#endif //__semaphore_cuda

	namespace experimental
	{
	inline namespace v1 {
	namespace details
	{
	#ifdef __semaphore_cuda

	#else

	#ifdef __linux__
	// On Linux, we make use of the kernel memory wait operations. These have been available for a long time.
	// Returns the address of `a` rounded down to a multiple of sizeof(int).
	// Objects larger than an int are rejected at compile time.
	template <class A>
	inline const void *__semaphore_fixalign(A &a)
	{
	    static_assert(sizeof(A) <= sizeof(int), "Linux only supports 'int' for Futex.");
	    auto const address = (intptr_t)&a;
	    auto const low_bits = sizeof(int) - 1;
	    return (const void *)(address & ~low_bits);
	}
	// Copies sizeof(int) bytes starting at `p` into an int and returns it.
	inline int __semaphore_readint(const void *p)
	{
	    int word;
	    memcpy(&word, p, sizeof word);
	    return word;
	}
	// Issues a FUTEX_WAIT_PRIVATE on the int-aligned word containing `a`, but
	// only if `a` still holds `v`.
	//
	// The aligned word is sampled first and `a` is re-checked afterwards; the
	// sampled word is what gets handed to the futex call as the expected value.
	// Returns without making the syscall when `a != v`.
	//
	// The unused timeout / uaddr2 slots are passed as nullptr rather than a bare
	// `0`: syscall() is variadic, so an int 0 is not guaranteed to arrive as a
	// full-width null pointer.
	template <class A, class V>
	inline void __semaphore_wait(A &a, V v)
	{
	    auto p = __semaphore_fixalign(a);
	    auto i = __semaphore_readint(p);
	    // Compiler barrier: keep the word sample above ahead of the load below.
	    asm volatile("" ::: "memory");
	    if (a.load(memory_order_relaxed) != v)
	        return;
	    syscall(SYS_futex, p, FUTEX_WAIT_PRIVATE, i, nullptr, nullptr, 0);
	}
	// Timed counterpart of __semaphore_wait: issues a FUTEX_WAIT_PRIVATE on the
	// int-aligned word containing `a`, with `t` converted by
	// __semaphore_to_timespec, but only if `a` still holds `v`.
	// NOTE(review): the futex timeout slot expects a pointer to a timespec;
	// confirm what __semaphore_to_timespec returns.
	template <class A, class V, class Rep, class Period>
	void __semaphore_wait_timed(A &a, V v, const chrono::duration<Rep, Period> &t)
	{
	    auto const word_address = __semaphore_fixalign(a);
	    auto const sampled_word = __semaphore_readint(word_address);
	    // Compiler barrier between sampling the word and re-checking `a`.
	    asm volatile("" ::: "memory");
	    if (a.load(memory_order_relaxed) != v)
	        return;
	    syscall(SYS_futex, word_address, FUTEX_WAIT_PRIVATE, sampled_word, __semaphore_to_timespec(t), 0, 0);
	}
	// Issues FUTEX_WAKE_PRIVATE with a wake count of 1 on the int-aligned word
	// containing `a`.
	template <class A>
	inline void __semaphore_wake_one(A &a)
	{
	    auto const word_address = __semaphore_fixalign(a);
	    syscall(SYS_futex, word_address, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
	}
	// Issues FUTEX_WAKE_PRIVATE with a wake count of INT_MAX on the int-aligned
	// word containing `a`.
	template <class A>
	inline void __semaphore_wake_all(A &a)
	{
	    auto const word_address = __semaphore_fixalign(a);
	    syscall(SYS_futex, word_address, FUTEX_WAKE_PRIVATE, INT_MAX, 0, 0, 0);
	}
	// Overload for volatile atomics: same protocol as the non-volatile
	// __semaphore_wait, but uses FUTEX_WAIT instead of FUTEX_WAIT_PRIVATE.
	//
	// The aligned word is sampled first and `a` is re-checked afterwards; the
	// sampled word is handed to the futex call as the expected value. Returns
	// without making the syscall when `a != v`.
	//
	// The unused timeout / uaddr2 slots are passed as nullptr rather than a bare
	// `0`: syscall() is variadic, so an int 0 is not guaranteed to arrive as a
	// full-width null pointer.
	template <class A, class V>
	inline void __semaphore_wait(volatile A &a, V v)
	{
	    auto p = __semaphore_fixalign(a);
	    auto i = __semaphore_readint(p);
	    // Compiler barrier: keep the word sample above ahead of the load below.
	    asm volatile("" ::: "memory");
	    if (a.load(memory_order_relaxed) != v)
	        return;
	    syscall(SYS_futex, p, FUTEX_WAIT, i, nullptr, nullptr, 0);
	}
	// Timed wait for volatile atomics: issues FUTEX_WAIT (not the _PRIVATE form)
	// on the int-aligned word containing `a`, with `t` converted by
	// details::__semaphore_to_timespec, but only if `a` still holds `v`.
	// NOTE(review): the futex timeout slot expects a pointer to a timespec;
	// confirm what __semaphore_to_timespec returns.
	template <class A, class V, class Rep, class Period>
	void __semaphore_wait_timed(volatile A &a, V v, const chrono::duration<Rep, Period> &t)
	{
	    auto const word_address = __semaphore_fixalign(a);
	    auto const sampled_word = __semaphore_readint(word_address);
	    // Compiler barrier between sampling the word and re-checking `a`.
	    asm volatile("" ::: "memory");
	    if (a.load(memory_order_relaxed) != v)
	        return;
	    syscall(SYS_futex, word_address, FUTEX_WAIT, sampled_word, details::__semaphore_to_timespec(t), 0, 0);
	}
	// Volatile overload: issues FUTEX_WAKE with a wake count of 1 on the
	// int-aligned word containing `a`.
	template <class A>
	inline void __semaphore_wake_one(volatile A &a)
	{
	    auto const word_address = __semaphore_fixalign(a);
	    syscall(SYS_futex, word_address, FUTEX_WAKE, 1, 0, 0, 0);
	}
	// Volatile overload: issues FUTEX_WAKE with a wake count of INT_MAX on the
	// int-aligned word containing `a`.
	template <class A>
	inline void __semaphore_wake_all(volatile A &a)
	{
	    auto const word_address = __semaphore_fixalign(a);
	    syscall(SYS_futex, word_address, FUTEX_WAKE, INT_MAX, 0, 0, 0);
	}
	#endif // __linux__

	#if defined(WIN32) && _WIN32_WINNT >= 0x0602
	// On Windows, we make use of the kernel memory wait operations as well. These first became available with Windows 8.
	template <class A, class V>
	void __semaphore_wait(A &a, V v)
	{
	static_assert(sizeof(V) <= 8, "Windows only allows sizes between 1B and 8B for WaitOnAddress.");
	WaitOnAddress((PVOID)&a, (PVOID)&v, sizeof(v), -1);
	}
	// Calls WaitOnAddress on `a`, comparing against a local copy of `v`, with
	// `delta` converted to whole milliseconds as the timeout.
	template <class A, class V, class Rep, class Period>
	void __semaphore_wait_timed(A &a, V v, chrono::duration<Rep, Period> const &delta)
	{
	    static_assert(sizeof(V) <= 8, "Windows only allows sizes between 1B and 8B for WaitOnAddress.");
	    auto const timeout_ms = (DWORD)chrono::duration_cast<chrono::milliseconds>(delta).count();
	    WaitOnAddress((PVOID)&a, (PVOID)&v, sizeof(v), timeout_ms);
	}
	template <class A>
	inline void __semaphore_wake_one(A &a)
	{
	WakeByAddressSingle((PVOID)&a);
	}
	template <class A>
	inline void __semaphore_wake_all(A &a)
	{
	WakeByAddressAll((PVOID)&a);
	}
	#endif // defined(WIN32) && _WIN32_WINNT >= 0x0602

	#endif // __semaphore_cuda

	// Slow path shared by the blocking and timed binary_semaphore acquires.
	//
	// Takes a ticket from `ticket`, then loops until `tocket` equals that ticket
	// and a compare-exchange manages to set __valubit in `atom`.
	//   atom   - the semaphore state word (value / lock / slow / contention bits).
	//   ticket - counter this call increments to obtain its ticket.
	//   tocket - counter compared against the ticket to decide whose turn it is.
	//   (bool const&) - unnamed and unused in this function.
	//   fn     - callback invoked with an observed value of `atom`; when it
	//            returns false the acquire is abandoned.
	// Returns true once the compare-exchange succeeds, false if `fn` returned false.
	// NOTE(review): nothing in this function advances `tocket`; presumably the
	// release path does - confirm.
	template<class Fn>
	__semaphore_abi bool __binary_semaphore_acquire_slow(
	    atomic<binary_semaphore::count_type>& atom, atomic<binary_semaphore::count_type>& ticket,
	    atomic<binary_semaphore::count_type>& tocket, bool const&, Fn fn) noexcept
	{
	    uint32_t const tick = ticket.fetch_add(1, std::memory_order_relaxed);
	    uint32_t tock = tocket.load(std::memory_order_relaxed);
	    // Non-zero once this call has added __contbit to `atom` (see loop tail).
	    uint32_t contbit = 0u;
	#ifdef __semaphore_fast_path
	    // Running total of the sleep durations requested so far, in nanoseconds.
	    uint32_t sum = 0u;
	    while(1) {
	        // Sleep-based backoff until 64*1024 ns have been requested in total;
	        // after that, switch to the fn-based wait in the else branch.
	        if(sum < 64*1024) {
	#else
	    while(1) {
	#endif
	            // Sleep length scales with the distance between our ticket and the
	            // last observed `tocket` value.
	            uint32_t const delta = (tick - tock) * 128;
	#if !defined(__CUDA_ARCH__)
	            std::this_thread::sleep_for(std::chrono::nanoseconds(delta));
	#elif defined(__has_cuda_nanosleep)
	            details::__mme_nanosleep(delta);
	#endif
	#ifdef __semaphore_fast_path
	            sum += delta;
	        }
	        else
	        {
	            // Set __slowbit in `atom`; `old` is the resulting value.
	            uint32_t old = atom.fetch_or(binary_semaphore::__slowbit, std::memory_order_relaxed) | binary_semaphore::__slowbit;
	            // Only hand off to fn while the value bit is set.
	            if ((old & binary_semaphore::__valubit) != 0) {
	                atomic_thread_fence(std::memory_order_seq_cst);
	                if(!fn(old))
	                    return false;
	            }
	        }
	        uint32_t old = atom.load(std::memory_order_relaxed);
	#else
	        // Without the fast path, fn is consulted on every iteration.
	        uint32_t old = atom.load(std::memory_order_relaxed);
	        if(!fn(old))
	            return false;
	#endif
	        // Not our turn until `tocket` matches the ticket we drew.
	        tock = tocket.load(std::memory_order_relaxed);
	        if(tock != tick)
	            continue;
	        // Value bit clear: try to set it, removing any contention bit we added.
	        while ((old & binary_semaphore::__valubit) == 0) {
	            // The expected value never includes the lock bit.
	            old &= ~binary_semaphore::__lockbit;
	            uint32_t next = old - contbit + binary_semaphore::__valubit;
	            if (atom.compare_exchange_weak(old, next, std::memory_order_acquire, std::memory_order_relaxed))
	                return true;
	        }
	        // Value bit is set: add __contbit to `atom` once.
	        if(contbit == 0)
	            atom.fetch_add(contbit = binary_semaphore::__contbit, std::memory_order_relaxed);
	    }
	}

	}

	#ifdef __semaphore_fast_path
	// Slow-path release: clears __valubit and __slowbit in __atom with a
	// release compare-exchange. If __slowbit was set in the value that was
	// replaced, __lockbit is set by the same exchange, all waiters on __atom are
	// woken, and __lockbit is cleared again afterwards.
	//   old - the caller's last observed value of __atom.
	__semaphore_abi void binary_semaphore::__release_slow(count_type old) noexcept
	{
	    count_type lock = 0;
	    do {
	        // The expected value never includes the lock bit.
	        old &= ~__lockbit;
	        lock = (old & __slowbit) ? __lockbit : 0;
	    } while (!__atom.compare_exchange_weak(old, (old | lock) & ~(__valubit | __slowbit), std::memory_order_release, std::memory_order_relaxed));
	    if (lock != 0)
	    {
	        atomic_thread_fence(std::memory_order_seq_cst);
	        details::__semaphore_wake_all(__atom);
	        __atom.fetch_and(~__lockbit, std::memory_order_release);
	    }
	}
	#endif

	// Untimed slow-path acquire: delegates to the shared
	// __binary_semaphore_acquire_slow with a callback that never gives up.
	// With __semaphore_fast_path the callback waits on __atom via
	// details::__semaphore_wait; otherwise it does nothing.
	__semaphore_abi void binary_semaphore::__acquire_slow() noexcept
	{
	    auto const wait_for_change = [=] __semaphore_abi (uint32_t observed) -> bool {
	#ifdef __semaphore_fast_path
	        details::__semaphore_wait(__atom, observed);
	#else
	        (void)observed;
	#endif
	        // Always keep trying.
	        return true;
	    };
	    details::__binary_semaphore_acquire_slow(__atom, __ticket, __tocket, __stolen, wait_for_change);
	}

	#ifndef __semaphore_cuda

	// Timed slow-path acquire: delegates to the shared
	// __binary_semaphore_acquire_slow with a callback that reports whether
	// `abs_time` is still in the future. With __semaphore_fast_path the callback
	// first waits on __atom for the remaining time, if any remains.
	// Returns whatever the shared slow path returns.
	__semaphore_abi bool binary_semaphore::__acquire_slow_timed(std::chrono::time_point<details::__semaphore_clock, details::__semaphore_duration> const& abs_time) noexcept
	{
	    auto const wait_until_deadline = [=](uint32_t observed) __semaphore_abi -> bool {
	#ifdef __semaphore_fast_path
	        auto remaining = abs_time - details::__semaphore_clock::now();
	        if(remaining > std::chrono::microseconds(0))
	            details::__semaphore_wait_timed(__atom, observed, remaining);
	#else
	        (void)observed;
	#endif
	        // false once the deadline has been reached.
	        return details::__semaphore_clock::now() < abs_time;
	    };
	    return details::__binary_semaphore_acquire_slow(__atom, __ticket, __tocket, __stolen, wait_until_deadline);
	}

	#endif

	#ifndef __semaphore_sem

	// Tries to subtract one unit (1 << __shift) from `atom` with a
	// compare-exchange, retrying while the count field of the freshly observed
	// value is at least 1.
	//   old   - the caller's last observed value of `atom`.
	//   order - memory order used when the exchange succeeds.
	// Returns true if the subtraction was applied, false once the observed
	// count field drops below 1.
	__semaphore_abi bool counting_semaphore::__fetch_sub_if_slow(counting_semaphore::count_type old, std::memory_order order) noexcept
	{
	    while (true)
	    {
	        // The expected value never includes the lock bits.
	        old &= ~__lockmask;
	        if (atom.compare_exchange_weak(old, old - (1 << __shift), order, std::memory_order_relaxed))
	            return true;
	        // A failed exchange refreshed `old`; stop when nothing is left to take.
	        if ((old >> __shift) < 1)
	            return false;
	    }
	}

	#ifdef __semaphore_fast_path
	// Slow-path release: adds `term` units to the count held in `atom` with a
	// compare-exchange loop. When the replaced value had contention bits set and
	// `notify` is not `none`, the lock bits are set by the same exchange, the
	// requested wake-up is issued, and the lock bits are cleared afterwards.
	//   term   - number of units to add (shifted by __shift before adding).
	//   old    - the caller's last observed value of `atom`.
	//   order  - memory order used when the exchange succeeds.
	//   notify - selects wake-all, wake-one, or no wake-up.
	// NOTE(review): the enumerators are spelled both `semaphore_notify::none`
	// and `semaphore_notify_none` in this function; confirm both are declared.
	void counting_semaphore::__fetch_add_slow(counting_semaphore::count_type term, counting_semaphore::count_type old, std::memory_order order, semaphore_notify notify) noexcept
	{
	    while (1)
	    {

	        bool const apply_lock = ((old & __contmask) != 0) && (notify != semaphore_notify::none);
	        // The new value is derived from `old` before the lock bits are masked out.
	        int const set = ((old & __valumask) + (term << __shift)) | (apply_lock ? __lockmask : 0);

	        // The expected value never includes the lock bits.
	        old &= ~__lockmask;
	        if (atom.compare_exchange_weak(old, set, order, std::memory_order_relaxed))
	        {
	            if (apply_lock)
	            {
	                switch (notify)
	                {
	                case semaphore_notify_all:
	                    details::__semaphore_wake_all(atom);
	                    break;
	                case semaphore_notify_one:
	                    details::__semaphore_wake_one(atom);
	                    break;
	                case semaphore_notify_none:
	                    break;
	                }
	                atom.fetch_and(~__lockmask, std::memory_order_relaxed);
	            }
	            break;
	        }
	    }
	}
	#endif

	// Slow-path wait: returns once the count field of `atom` (the bits above
	// __shift) has been observed to be at least 1. This function does not itself
	// modify the count.
	//   order - memory order used for the polling loads of `atom`.
	// Without __semaphore_fast_path it polls with backoff forever. With
	// __semaphore_fast_path it polls twice, then marks contention in `atom` and
	// blocks in details::__semaphore_wait between checks.
	__semaphore_abi void counting_semaphore::__acquire_slow(std::memory_order order) noexcept
	{

	    int old;
	    details::__semaphore_exponential_backoff b;
	#ifdef __semaphore_fast_path
	    // Fast path: only two backoff-and-poll rounds before blocking.
	    for (int i = 0; i < 2; ++i)
	    {
	#else
	    while (1)
	    {
	#endif
	        b.sleep();
	        old = atom.load(order);
	        if ((old >> __shift) >= 1)
	            goto done;
	    }
	#ifdef __semaphore_fast_path
	    while (1)
	    {
	        // Set the contention bits; `old` is the resulting value.
	        old = atom.fetch_or(__contmask, std::memory_order_relaxed) | __contmask;
	        if ((old >> __shift) >= 1)
	            goto done;
	        // Block while `atom` still holds the value just observed.
	        details::__semaphore_wait(atom, old);
	        old = atom.load(order);
	        if ((old >> __shift) >= 1)
	            goto done;
	    }
	#endif
	done:
	#ifdef __semaphore_fast_path
	    // Spin until the lock bits are no longer set in the observed value.
	    while (old & __lockmask)
	        old = atom.load(std::memory_order_relaxed);
	#else
	    ;
	#endif
	}
	#endif

	#ifndef __semaphore_cuda

	// Bytes reserved per table entry: the larger of sizeof(synchronic) and
	// alignof(synchronic).
	static constexpr int __atomic_wait_table_entry_size = sizeof(synchronic) > alignof(synchronic) ? sizeof(synchronic) : alignof(synchronic);

	// Number of entries in the table. __atomic_wait_table_index masks with
	// (count - 1), so this is expected to stay a power of two.
	static constexpr int __atomic_wait_table_entry_count = 1024;

	// Zero-initialized, 64-byte-aligned raw storage; each row is handed out as a
	// `synchronic` by __atomic_wait_get_semaphore.
	__semaphore_managed alignas(64) unsigned char __atomic_wait_table[__atomic_wait_table_entry_count][__atomic_wait_table_entry_size] = { 0 };

	// Maps an address to a row of __atomic_wait_table: the address is divided by
	// the entry size and the result is masked with (entry count - 1).
	__semaphore_abi size_t __atomic_wait_table_index(void const* ptr) {
	    auto const address = (uintptr_t)ptr;
	    auto const slot = address / __atomic_wait_table_entry_size;
	    return slot & (__atomic_wait_table_entry_count - 1);
	}

	// Returns the __atomic_wait_table row selected by __atomic_wait_table_index
	// for address `a`, viewed as a `synchronic`.
	// The return type and the parameter are pointers; the `*` markers had been
	// lost from this copy of the signature.
	__semaphore_abi synchronic *__atomic_wait_get_semaphore(void const *a)
	{
	    return (synchronic *)&__atomic_wait_table[__atomic_wait_table_index(a)][0];
	}

	#endif
	}
	} // namespace experimental
	} // namespace std