$OpenBSD: patch-vpx_ports_x86_h,v 1.1 2015/02/11 00:07:47 brad Exp $

https://chromium.googlesource.com/webm/libvpx/+/9f268611472bbdfa751aedb74a306ee502e7faca

Support for extended feature flags enumeration leaf in CPUID instruction

This CL fixes an oversight in the AVX2 support CL previously
merged (Change-Id: Idc03f3fca4bf2d0afd33631ea1d3caf8fc34ec29) that
prevented runtime execution of AVX2 code in WebM.

Background:
Starting with the Sandy Bridge processor, the CPUID instruction was
enhanced to add various extended feature flag enumeration leaves.
Reading these leaves requires an additional input value for the CPUID
instruction which is stored in ECX. This change adds this second input
value for all ARCH_X86 and ARCH_X86_64 targets to the CPUID macros,
allowing checks of EBX bit 5 for AVX2 support. This capability will be
required moving forward to check for future processor features.

https://chromium.googlesource.com/webm/libvpx/+/7e515c46372e6dee7f47bae3733378756e40783a

fix AVX & AVX2 detection

fixes issue #790 which resulted in a SIGILL on OpenBSD

code is mostly from libwebp, based on the following:

https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family

https://chromium.googlesource.com/webm/libvpx/+/4ed1bda7af3a5da820068d68c7af80103bc8f3ba

x86: correct OSXSAVE + AVX bit check

the result should have both bits set; previously this was converted from
webp incorrectly and resulted in a boolean check...

--- vpx_ports/x86.h.orig	Fri Jan 10 15:12:42 2014
+++ vpx_ports/x86.h	Thu Feb  5 20:35:14 2015
@@ -13,6 +13,7 @@
 #define VPX_PORTS_X86_H
 #include <stdlib.h>
 #include "vpx_config.h"
+#include "vpx/vpx_integer.h"
 
 typedef enum {
   VPX_CPU_UNKNOWN = -1,
@@ -35,51 +36,53 @@ typedef enum {
 
 #if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
 #if ARCH_X86_64
-#define cpuid(func,ax,bx,cx,dx)\
+#define cpuid(func, func2, ax, bx, cx, dx)\
   __asm__ __volatile__ (\
                         "cpuid           \n\t" \
                         : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) \
-                        : "a"  (func));
+                        : "a" (func), "c" (func2));
 #else
-#define cpuid(func,ax,bx,cx,dx)\
+#define cpuid(func, func2, ax, bx, cx, dx)\
   __asm__ __volatile__ (\
                         "mov %%ebx, %%edi   \n\t" \
                         "cpuid              \n\t" \
                         "xchg %%edi, %%ebx  \n\t" \
                         : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
-                        : "a" (func));
+                        : "a" (func), "c" (func2));
 #endif
 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
 #if ARCH_X86_64
-#define cpuid(func,ax,bx,cx,dx)\
+#define cpuid(func, func2, ax, bx, cx, dx)\
   asm volatile (\
                 "xchg %rsi, %rbx \n\t" \
                 "cpuid           \n\t" \
                 "movl %ebx, %edi \n\t" \
                 "xchg %rsi, %rbx \n\t" \
                 : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
-                : "a"  (func));
+                : "a" (func), "c" (func2));
 #else
-#define cpuid(func,ax,bx,cx,dx)\
+#define cpuid(func, func2, ax, bx, cx, dx)\
   asm volatile (\
                 "pushl %ebx       \n\t" \
                 "cpuid            \n\t" \
                 "movl %ebx, %edi  \n\t" \
                 "popl %ebx        \n\t" \
                 : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
-                : "a" (func));
+                : "a" (func), "c" (func2));
 #endif
 #else /* end __SUNPRO__ */
 #if ARCH_X86_64
-void __cpuid(int CPUInfo[4], int info_type);
-#pragma intrinsic(__cpuid)
-#define cpuid(func,a,b,c,d) do{\
+void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
+#pragma intrinsic(__cpuidex)
+#define cpuid(func, func2, a, b, c, d) do{\
     int regs[4];\
-    __cpuid(regs,func); a=regs[0];  b=regs[1];  c=regs[2];  d=regs[3];\
+    __cpuidex(regs, func, func2); /* func -> EAX (leaf), func2 -> ECX (sub-leaf) */\
+    a = regs[0];  b = regs[1];  c = regs[2];  d = regs[3];\
   } while(0)
 #else
-#define cpuid(func,a,b,c,d)\
+#define cpuid(func, func2, a, b, c, d)\
   __asm mov eax, func\
+  __asm mov ecx, func2\
   __asm cpuid\
   __asm mov a, eax\
   __asm mov b, ebx\
@@ -88,6 +91,37 @@ void __cpuid(int CPUInfo[4], int info_type);
 #endif
 #endif /* end others */
 
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile (
+    ".byte 0x0f, 0x01, 0xd0\n"
+    : "=a"(eax), "=d"(edx) : "c" (ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
 #define HAS_MMX     0x01
 #define HAS_SSE     0x02
 #define HAS_SSE2    0x04
@@ -120,13 +154,13 @@ x86_simd_caps(void) {
     mask = strtol(env, NULL, 0);
 
   /* Ensure that the CPUID instruction supports extended features */
-  cpuid(0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
 
   if (reg_eax < 1)
     return 0;
 
   /* Get the standard feature flags */
-  cpuid(1, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
 
   if (reg_edx & BIT(23)) flags |= HAS_MMX;
 
@@ -140,9 +174,15 @@ x86_simd_caps(void) {
 
   if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
 
-  if (reg_ecx & BIT(28)) flags |= HAS_AVX;
-
-  if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    if ((xgetbv() & 0x6) == 0x6) {
+      flags |= HAS_AVX;
+      /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+      cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+      if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+    }
+  }
 
   return flags & mask;
 }
