Updated SDL's YUV support, many thanks to Adrien Descamps

New functions get and set the YUV colorspace conversion mode: SDL_SetYUVConversionMode(), SDL_GetYUVConversionMode(), and SDL_GetYUVConversionModeForResolution(). SDL_ConvertPixels() converts between all supported RGB and YUV formats, with SSE acceleration for converting from planar YUV formats (YV12, NV12, etc.) to common RGB/RGBA formats. Added a new test program, testyuv, to verify correctness and speed of YUV conversion functionality.
2017-11-12 22:51:12 -08:00 · 2017-11-12 22:51:12 -08:00 · 145d2469ae
commit 145d2469ae
parent e7cc03e0bd
60 changed files with 8368 additions and 4310 deletions
--- a/src/render/SDL_yuv_mmx.c
+++ b/src/render/SDL_yuv_mmx.c
@ -1,409 +0,0 @@
-/*
-  Simple DirectMedia Layer
-  Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
-
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-*/
-#include "../SDL_internal.h"
-
-#include "SDL_yuv_mmx_c.h"
-
-#ifdef USE_MMX_ASSEMBLY
-
-#include "SDL_stdinc.h"
-
-#include "mmx.h"
-
-/* *INDENT-OFF* */
-
-static mmx_t MMX_0080w    = { .ud = {0x00800080, 0x00800080} };
-static mmx_t MMX_00FFw    = { .ud = {0x00ff00ff, 0x00ff00ff} };
-static mmx_t MMX_FF00w    = { .ud = {0xff00ff00, 0xff00ff00} };
-
-static mmx_t MMX_Ycoeff   = { .uw = {0x004a, 0x004a, 0x004a, 0x004a} };
-
-static mmx_t MMX_UbluRGB  = { .uw = {0x0072, 0x0072, 0x0072, 0x0072} };
-static mmx_t MMX_VredRGB  = { .uw = {0x0059, 0x0059, 0x0059, 0x0059} };
-static mmx_t MMX_UgrnRGB  = { .uw = {0xffea, 0xffea, 0xffea, 0xffea} };
-static mmx_t MMX_VgrnRGB  = { .uw = {0xffd2, 0xffd2, 0xffd2, 0xffd2} };
-
-static mmx_t MMX_Ublu5x5  = { .uw = {0x0081, 0x0081, 0x0081, 0x0081} };
-static mmx_t MMX_Vred5x5  = { .uw = {0x0066, 0x0066, 0x0066, 0x0066} };
-static mmx_t MMX_Ugrn565  = { .uw = {0xffe8, 0xffe8, 0xffe8, 0xffe8} };
-static mmx_t MMX_Vgrn565  = { .uw = {0xffcd, 0xffcd, 0xffcd, 0xffcd} };
-
-static mmx_t MMX_red565   = { .uw = {0xf800, 0xf800, 0xf800, 0xf800} };
-static mmx_t MMX_grn565   = { .uw = {0x07e0, 0x07e0, 0x07e0, 0x07e0} };
-
-/**
-   This MMX assembler is my first assembler/MMX program ever.
-   Thus it maybe buggy.
-   Send patches to:
-   mvogt@rhrk.uni-kl.de
-
-   After it worked fine I have "obfuscated" the code a bit to have
-   more parallism in the MMX units. This means I moved
-   initilisation around and delayed other instruction.
-   Performance measurement did not show that this brought any advantage
-   but in theory it _should_ be faster this way.
-
-   The overall performanve gain to the C based dither was 30%-40%.
-   The MMX routine calculates 256bit=8RGB values in each cycle
-   (4 for row1 & 4 for row2)
-
-   The red/green/blue.. coefficents are taken from the mpeg_play
-   player. They look nice, but I dont know if you can have
-   better values, to avoid integer rounding errors.
-
-
-   IMPORTANT:
-   ==========
-
-   It is a requirement that the cr/cb/lum are 8 byte aligned and
-   the out are 16byte aligned or you will/may get segfaults
-
-*/
-
-void ColorRGBDitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
-                              unsigned char *lum, unsigned char *cr,
-                              unsigned char *cb, unsigned char *out,
-                              int rows, int cols, int mod )
-{
-    Uint32 *row1;
-    Uint32 *row2;
-
-    unsigned char* y = lum +cols*rows;    /* Pointer to the end */
-    int x = 0;
-    row1 = (Uint32 *)out;                 /* 32 bit target */
-    row2 = (Uint32 *)out+cols+mod;        /* start of second row */
-    mod = (mod+cols+mod)*4;               /* increment for row1 in byte */
-
-    __asm__ __volatile__ (
-        ".align 8\n"
-        "1:\n"
-
-        /* create Cr (result in mm1) */
-        "movd (%0),%%mm1\n"   /*         0  0  0  0  v3 v2 v1 v0 */
-        "pxor %%mm7,%%mm7\n"      /*         00 00 00 00 00 00 00 00 */
-        "movd (%2), %%mm2\n"           /*    0  0  0  0 l3 l2 l1 l0 */
-        "punpcklbw %%mm7,%%mm1\n" /*         0  v3 0  v2 00 v1 00 v0 */
-        "punpckldq %%mm1,%%mm1\n" /*         00 v1 00 v0 00 v1 00 v0 */
-        "psubw %9,%%mm1\n"        /* mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 */
-
-        /* create Cr_g (result in mm0) */
-        "movq %%mm1,%%mm0\n"           /* r1 r1 r0 r0 r1 r1 r0 r0 */
-        "pmullw %10,%%mm0\n"           /* red*-46dec=0.7136*64 */
-        "pmullw %11,%%mm1\n"           /* red*89dec=1.4013*64 */
-        "psraw  $6, %%mm0\n"           /* red=red/64 */
-        "psraw  $6, %%mm1\n"           /* red=red/64 */
-
-        /* create L1 L2 (result in mm2,mm4) */
-        /* L2=lum+cols */
-        "movq (%2,%4),%%mm3\n"         /*    0  0  0  0 L3 L2 L1 L0 */
-        "punpckldq %%mm3,%%mm2\n"      /*   L3 L2 L1 L0 l3 l2 l1 l0 */
-        "movq %%mm2,%%mm4\n"           /*   L3 L2 L1 L0 l3 l2 l1 l0 */
-        "pand %12,%%mm2\n"             /*   L3 0  L1  0 l3  0 l1  0 */
-        "pand %13,%%mm4\n"             /*   0  L2  0 L0  0 l2  0 l0 */
-        "psrlw $8,%%mm2\n"             /*   0  L3  0 L1  0 l3  0 l1 */
-
-        /* create R (result in mm6) */
-        "movq %%mm2,%%mm5\n"           /*   0 L3  0 L1  0 l3  0 l1 */
-        "movq %%mm4,%%mm6\n"           /*   0 L2  0 L0  0 l2  0 l0 */
-        "paddsw  %%mm1, %%mm5\n"       /* lum1+red:x R3 x R1 x r3 x r1 */
-        "paddsw  %%mm1, %%mm6\n"       /* lum1+red:x R2 x R0 x r2 x r0 */
-        "packuswb %%mm5,%%mm5\n"       /*  R3 R1 r3 r1 R3 R1 r3 r1 */
-        "packuswb %%mm6,%%mm6\n"       /*  R2 R0 r2 r0 R2 R0 r2 r0 */
-        "pxor %%mm7,%%mm7\n"      /*         00 00 00 00 00 00 00 00 */
-        "punpcklbw %%mm5,%%mm6\n"      /*  R3 R2 R1 R0 r3 r2 r1 r0 */
-
-        /* create Cb (result in mm1) */
-        "movd (%1), %%mm1\n"      /*         0  0  0  0  u3 u2 u1 u0 */
-        "punpcklbw %%mm7,%%mm1\n" /*         0  u3 0  u2 00 u1 00 u0 */
-        "punpckldq %%mm1,%%mm1\n" /*         00 u1 00 u0 00 u1 00 u0 */
-        "psubw %9,%%mm1\n"        /* mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 */
-
-        /* create Cb_g (result in mm5) */
-        "movq %%mm1,%%mm5\n"            /* u1 u1 u0 u0 u1 u1 u0 u0 */
-        "pmullw %14,%%mm5\n"            /* blue*-109dec=1.7129*64 */
-        "pmullw %15,%%mm1\n"            /* blue*114dec=1.78125*64 */
-        "psraw  $6, %%mm5\n"            /* blue=red/64 */
-        "psraw  $6, %%mm1\n"            /* blue=blue/64 */
-
-        /* create G (result in mm7) */
-        "movq %%mm2,%%mm3\n"      /*   0  L3  0 L1  0 l3  0 l1 */
-        "movq %%mm4,%%mm7\n"      /*   0  L2  0 L0  0 l2  0 l1 */
-        "paddsw  %%mm5, %%mm3\n"  /* lum1+Cb_g:x G3t x G1t x g3t x g1t */
-        "paddsw  %%mm5, %%mm7\n"  /* lum1+Cb_g:x G2t x G0t x g2t x g0t */
-        "paddsw  %%mm0, %%mm3\n"  /* lum1+Cr_g:x G3  x G1  x g3  x g1 */
-        "paddsw  %%mm0, %%mm7\n"  /* lum1+blue:x G2  x G0  x g2  x g0 */
-        "packuswb %%mm3,%%mm3\n"  /* G3 G1 g3 g1 G3 G1 g3 g1 */
-        "packuswb %%mm7,%%mm7\n"  /* G2 G0 g2 g0 G2 G0 g2 g0 */
-        "punpcklbw %%mm3,%%mm7\n" /* G3 G2 G1 G0 g3 g2 g1 g0 */
-
-        /* create B (result in mm5) */
-        "movq %%mm2,%%mm3\n"         /*   0  L3  0 L1  0 l3  0 l1 */
-        "movq %%mm4,%%mm5\n"         /*   0  L2  0 L0  0 l2  0 l1 */
-        "paddsw  %%mm1, %%mm3\n"     /* lum1+blue:x B3 x B1 x b3 x b1 */
-        "paddsw  %%mm1, %%mm5\n"     /* lum1+blue:x B2 x B0 x b2 x b0 */
-        "packuswb %%mm3,%%mm3\n"     /* B3 B1 b3 b1 B3 B1 b3 b1 */
-        "packuswb %%mm5,%%mm5\n"     /* B2 B0 b2 b0 B2 B0 b2 b0 */
-        "punpcklbw %%mm3,%%mm5\n"    /* B3 B2 B1 B0 b3 b2 b1 b0 */
-
-        /* fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */
-
-        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
-        "pxor %%mm4,%%mm4\n"           /*  0  0  0  0  0  0  0  0 */
-        "movq %%mm6,%%mm1\n"           /* R3 R2 R1 R0 r3 r2 r1 r0 */
-        "movq %%mm5,%%mm3\n"           /* B3 B2 B1 B0 b3 b2 b1 b0 */
-
-        /* process lower lum */
-        "punpcklbw %%mm4,%%mm1\n"      /*  0 r3  0 r2  0 r1  0 r0 */
-        "punpcklbw %%mm4,%%mm3\n"      /*  0 b3  0 b2  0 b1  0 b0 */
-        "movq %%mm1,%%mm2\n"           /*  0 r3  0 r2  0 r1  0 r0 */
-        "movq %%mm3,%%mm0\n"           /*  0 b3  0 b2  0 b1  0 b0 */
-        "punpcklwd %%mm1,%%mm3\n"      /*  0 r1  0 b1  0 r0  0 b0 */
-        "punpckhwd %%mm2,%%mm0\n"      /*  0 r3  0 b3  0 r2  0 b2 */
-
-        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
-        "movq %%mm7,%%mm1\n"           /* G3 G2 G1 G0 g3 g2 g1 g0 */
-        "punpcklbw %%mm1,%%mm2\n"      /* g3  0 g2  0 g1  0 g0  0 */
-        "punpcklwd %%mm4,%%mm2\n"      /*  0  0 g1  0  0  0 g0  0 */
-        "por %%mm3, %%mm2\n"          /*  0 r1 g1 b1  0 r0 g0 b0 */
-        "movq %%mm2,(%3)\n"          /* wrote out ! row1 */
-
-        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
-        "punpcklbw %%mm1,%%mm4\n"      /* g3  0 g2  0 g1  0 g0  0 */
-        "punpckhwd %%mm2,%%mm4\n"      /*  0  0 g3  0  0  0 g2  0 */
-        "por %%mm0, %%mm4\n"          /*  0 r3 g3 b3  0 r2 g2 b2 */
-        "movq %%mm4,8(%3)\n"         /* wrote out ! row1 */
-
-        /* fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */
-        /* this can be done "destructive" */
-        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
-        "punpckhbw %%mm2,%%mm6\n"      /*  0 R3  0 R2  0 R1  0 R0 */
-        "punpckhbw %%mm1,%%mm5\n"      /* G3 B3 G2 B2 G1 B1 G0 B0 */
-        "movq %%mm5,%%mm1\n"           /* G3 B3 G2 B2 G1 B1 G0 B0 */
-        "punpcklwd %%mm6,%%mm1\n"      /*  0 R1 G1 B1  0 R0 G0 B0 */
-        "movq %%mm1,(%5)\n"          /* wrote out ! row2 */
-        "punpckhwd %%mm6,%%mm5\n"      /*  0 R3 G3 B3  0 R2 G2 B2 */
-        "movq %%mm5,8(%5)\n"         /* wrote out ! row2 */
-
-        "addl $4,%2\n"            /* lum+4 */
-        "leal 16(%3),%3\n"        /* row1+16 */
-        "leal 16(%5),%5\n"        /* row2+16 */
-        "addl $2,%0\n"        /* cr+2 */
-        "addl $2,%1\n"           /* cb+2 */
-
-        "addl $4,%6\n"            /* x+4 */
-        "cmpl %4,%6\n"
-
-        "jl 1b\n"
-        "addl %4,%2\n" /* lum += cols */
-        "addl %8,%3\n" /* row1+= mod */
-        "addl %8,%5\n" /* row2+= mod */
-        "movl $0,%6\n" /* x=0 */
-        "cmpl %7,%2\n"
-        "jl 1b\n"
-
-        "emms\n"  /* reset MMX registers. */
-        :
-        : "r" (cr), "r"(cb),"r"(lum),
-          "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod),
-          "m"(MMX_0080w),"m"(MMX_VgrnRGB),"m"(MMX_VredRGB),
-          "m"(MMX_FF00w),"m"(MMX_00FFw),"m"(MMX_UgrnRGB),
-          "m"(MMX_UbluRGB)
-    );
-}
-
-void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
-                             unsigned char *lum, unsigned char *cr,
-                             unsigned char *cb, unsigned char *out,
-                             int rows, int cols, int mod )
-{
-    Uint16 *row1;
-    Uint16 *row2;
-
-    unsigned char* y = lum +cols*rows;    /* Pointer to the end */
-    int x = 0;
-    row1 = (Uint16 *)out;                 /* 16 bit target */
-    row2 = (Uint16 *)out+cols+mod;        /* start of second row  */
-    mod = (mod+cols+mod)*2;               /* increment for row1 in byte */
-
-    __asm__ __volatile__(
-        ".align 8\n"
-        "1:\n"
-
-        "movd           (%1),                   %%mm0\n" /* 4 Cb         0  0  0  0 u3 u2 u1 u0 */
-        "pxor           %%mm7,                  %%mm7\n"
-        "movd (%0), %%mm1\n"   /* 4 Cr                0  0  0  0 v3 v2 v1 v0 */
-
-        "punpcklbw      %%mm7,                  %%mm0\n" /* 4 W cb   0 u3  0 u2  0 u1  0 u0 */
-        "punpcklbw      %%mm7,                  %%mm1\n" /* 4 W cr   0 v3  0 v2  0 v1  0 v0 */
-        "psubw          %9,                     %%mm0\n"
-        "psubw          %9,                     %%mm1\n"
-        "movq           %%mm0,                  %%mm2\n" /* Cb                   0 u3  0 u2  0 u1  0 u0 */
-        "movq           %%mm1,                  %%mm3\n" /* Cr */
-        "pmullw         %10,                    %%mm2\n" /* Cb2green 0 R3  0 R2  0 R1  0 R0 */
-        "movq           (%2),                   %%mm6\n" /* L1      l7 L6 L5 L4 L3 L2 L1 L0 */
-        "pmullw         %11,                    %%mm0\n" /* Cb2blue */
-        "pand           %12,                    %%mm6\n" /* L1      00 L6 00 L4 00 L2 00 L0 */
-        "pmullw         %13,                    %%mm3\n" /* Cr2green */
-        "movq           (%2),                   %%mm7\n" /* L2 */
-        "pmullw         %14,                    %%mm1\n" /* Cr2red */
-        "psrlw          $8,                     %%mm7\n"        /* L2           00 L7 00 L5 00 L3 00 L1 */
-        "pmullw         %15,                    %%mm6\n" /* lum1 */
-        "paddw          %%mm3,                  %%mm2\n" /* Cb2green + Cr2green == green */
-        "pmullw         %15,                    %%mm7\n" /* lum2 */
-
-        "movq           %%mm6,                  %%mm4\n" /* lum1 */
-        "paddw          %%mm0,                  %%mm6\n" /* lum1 +blue 00 B6 00 B4 00 B2 00 B0 */
-        "movq           %%mm4,                  %%mm5\n" /* lum1 */
-        "paddw          %%mm1,                  %%mm4\n" /* lum1 +red  00 R6 00 R4 00 R2 00 R0 */
-        "paddw          %%mm2,                  %%mm5\n" /* lum1 +green 00 G6 00 G4 00 G2 00 G0 */
-        "psraw          $6,                     %%mm4\n" /* R1 0 .. 64 */
-        "movq           %%mm7,                  %%mm3\n" /* lum2                       00 L7 00 L5 00 L3 00 L1 */
-        "psraw          $6,                     %%mm5\n" /* G1  - .. + */
-        "paddw          %%mm0,                  %%mm7\n" /* Lum2 +blue 00 B7 00 B5 00 B3 00 B1 */
-        "psraw          $6,                     %%mm6\n" /* B1         0 .. 64 */
-        "packuswb       %%mm4,                  %%mm4\n" /* R1 R1 */
-        "packuswb       %%mm5,                  %%mm5\n" /* G1 G1 */
-        "packuswb       %%mm6,                  %%mm6\n" /* B1 B1 */
-        "punpcklbw      %%mm4,                  %%mm4\n"
-        "punpcklbw      %%mm5,                  %%mm5\n"
-
-        "pand           %16,                    %%mm4\n"
-        "psllw          $3,                     %%mm5\n" /* GREEN       1 */
-        "punpcklbw      %%mm6,                  %%mm6\n"
-        "pand           %17,                    %%mm5\n"
-        "pand           %16,                    %%mm6\n"
-        "por            %%mm5,                  %%mm4\n" /* */
-        "psrlw          $11,                    %%mm6\n" /* BLUE        1 */
-        "movq           %%mm3,                  %%mm5\n" /* lum2 */
-        "paddw          %%mm1,                  %%mm3\n" /* lum2 +red      00 R7 00 R5 00 R3 00 R1 */
-        "paddw          %%mm2,                  %%mm5\n" /* lum2 +green 00 G7 00 G5 00 G3 00 G1 */
-        "psraw          $6,                     %%mm3\n" /* R2 */
-        "por            %%mm6,                  %%mm4\n" /* MM4 */
-        "psraw          $6,                     %%mm5\n" /* G2 */
-        "movq           (%2, %4),               %%mm6\n" /* L3 load lum2 */
-        "psraw          $6,                     %%mm7\n"
-        "packuswb       %%mm3,                  %%mm3\n"
-        "packuswb       %%mm5,                  %%mm5\n"
-        "packuswb       %%mm7,                  %%mm7\n"
-        "pand           %12,                    %%mm6\n" /* L3 */
-        "punpcklbw      %%mm3,                  %%mm3\n"
-        "punpcklbw      %%mm5,                  %%mm5\n"
-        "pmullw         %15,                    %%mm6\n" /* lum3 */
-        "punpcklbw      %%mm7,                  %%mm7\n"
-        "psllw          $3,                     %%mm5\n" /* GREEN 2 */
-        "pand           %16,                    %%mm7\n"
-        "pand           %16,                    %%mm3\n"
-        "psrlw          $11,                    %%mm7\n" /* BLUE  2 */
-        "pand           %17,                    %%mm5\n"
-        "por            %%mm7,                  %%mm3\n"
-        "movq           (%2,%4),                %%mm7\n" /* L4 load lum2 */
-        "por            %%mm5,                  %%mm3\n"
-        "psrlw          $8,                     %%mm7\n" /* L4 */
-        "movq           %%mm4,                  %%mm5\n"
-        "punpcklwd      %%mm3,                  %%mm4\n"
-        "pmullw         %15,                    %%mm7\n" /* lum4 */
-        "punpckhwd      %%mm3,                  %%mm5\n"
-
-        "movq           %%mm4,                  (%3)\n"  /* write row1 */
-        "movq           %%mm5,                  8(%3)\n" /* write row1 */
-
-        "movq           %%mm6,                  %%mm4\n" /* Lum3 */
-        "paddw          %%mm0,                  %%mm6\n" /* Lum3 +blue */
-
-        "movq           %%mm4,                  %%mm5\n" /* Lum3 */
-        "paddw          %%mm1,                  %%mm4\n" /* Lum3 +red */
-        "paddw          %%mm2,                  %%mm5\n" /* Lum3 +green */
-        "psraw          $6,                     %%mm4\n"
-        "movq           %%mm7,                  %%mm3\n" /* Lum4 */
-        "psraw          $6,                     %%mm5\n"
-        "paddw          %%mm0,                  %%mm7\n" /* Lum4 +blue */
-        "psraw          $6,                     %%mm6\n" /* Lum3 +blue */
-        "movq           %%mm3,                  %%mm0\n" /* Lum4 */
-        "packuswb       %%mm4,                  %%mm4\n"
-        "paddw          %%mm1,                  %%mm3\n" /* Lum4 +red */
-        "packuswb       %%mm5,                  %%mm5\n"
-        "paddw          %%mm2,                  %%mm0\n" /* Lum4 +green */
-        "packuswb       %%mm6,                  %%mm6\n"
-        "punpcklbw      %%mm4,                  %%mm4\n"
-        "punpcklbw      %%mm5,                  %%mm5\n"
-        "punpcklbw      %%mm6,                  %%mm6\n"
-        "psllw          $3,                     %%mm5\n" /* GREEN 3 */
-        "pand           %16,                    %%mm4\n"
-        "psraw          $6,                     %%mm3\n" /* psr 6 */
-        "psraw          $6,                     %%mm0\n"
-        "pand           %16,                    %%mm6\n" /* BLUE */
-        "pand           %17,                    %%mm5\n"
-        "psrlw          $11,                    %%mm6\n" /* BLUE  3 */
-        "por            %%mm5,                  %%mm4\n"
-        "psraw          $6,                     %%mm7\n"
-        "por            %%mm6,                  %%mm4\n"
-        "packuswb       %%mm3,                  %%mm3\n"
-        "packuswb       %%mm0,                  %%mm0\n"
-        "packuswb       %%mm7,                  %%mm7\n"
-        "punpcklbw      %%mm3,                  %%mm3\n"
-        "punpcklbw      %%mm0,                  %%mm0\n"
-        "punpcklbw      %%mm7,                  %%mm7\n"
-        "pand           %16,                    %%mm3\n"
-        "pand           %16,                    %%mm7\n" /* BLUE */
-        "psllw          $3,                     %%mm0\n" /* GREEN 4 */
-        "psrlw          $11,                    %%mm7\n"
-        "pand           %17,                    %%mm0\n"
-        "por            %%mm7,                  %%mm3\n"
-        "por            %%mm0,                  %%mm3\n"
-
-        "movq           %%mm4,                  %%mm5\n"
-
-        "punpcklwd      %%mm3,                  %%mm4\n"
-        "punpckhwd      %%mm3,                  %%mm5\n"
-
-        "movq           %%mm4,                  (%5)\n"
-        "movq           %%mm5,                  8(%5)\n"
-
-        "addl           $8,                     %6\n"
-        "addl           $8,                     %2\n"
-        "addl           $4,                     %0\n"
-        "addl           $4,                     %1\n"
-        "cmpl           %4,                     %6\n"
-        "leal           16(%3),                 %3\n"
-        "leal           16(%5),%5\n" /* row2+16 */
-
-        "jl             1b\n"
-        "addl           %4,     %2\n" /* lum += cols */
-        "addl           %8,     %3\n" /* row1+= mod */
-        "addl           %8,     %5\n" /* row2+= mod */
-        "movl           $0,     %6\n" /* x=0 */
-        "cmpl           %7,     %2\n"
-        "jl             1b\n"
-        "emms\n"
-        :
-        : "r" (cr), "r"(cb),"r"(lum),
-          "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod),
-          "m"(MMX_0080w),"m"(MMX_Ugrn565),"m"(MMX_Ublu5x5),
-          "m"(MMX_00FFw),"m"(MMX_Vgrn565),"m"(MMX_Vred5x5),
-          "m"(MMX_Ycoeff),"m"(MMX_red565),"m"(MMX_grn565)
-    );
-}
-
-/* *INDENT-ON* */
-
-#endif /* USE_MMX_ASSEMBLY */
-
-/* vi: set ts=4 sw=4 expandtab: */
--- a/src/render/SDL_yuv_sw.c
+++ b/src/render/SDL_yuv_sw.c
--- a/src/render/SDL_yuv_sw_c.h
+++ b/src/render/SDL_yuv_sw_c.h
@ -30,16 +30,6 @@ struct SDL_SW_YUVTexture
    Uint32 target_format;
    int w, h;
    Uint8 *pixels;
-    int *colortab;
-    Uint32 *rgb_2_pix;
-    void (*Display1X) (int *colortab, Uint32 * rgb_2_pix,
-                       unsigned char *lum, unsigned char *cr,
-                       unsigned char *cb, unsigned char *out,
-                       int rows, int cols, int mod);
-    void (*Display2X) (int *colortab, Uint32 * rgb_2_pix,
-                       unsigned char *lum, unsigned char *cr,
-                       unsigned char *cb, unsigned char *out,
-                       int rows, int cols, int mod);

    /* These are just so we don't have to allocate them separately */
    Uint16 pitches[3];
--- a/src/render/direct3d/SDL_render_d3d.c
+++ b/src/render/direct3d/SDL_render_d3d.c
@ -39,85 +39,7 @@
 #include <d3d9.h>
 #endif

-
-#ifdef ASSEMBLE_SHADER
-#pragma comment(lib, "d3dx9.lib")
-
-/**************************************************************************
- * ID3DXBuffer:
- * ------------
- * The buffer object is used by D3DX to return arbitrary size data.
- *
- * GetBufferPointer -
- *    Returns a pointer to the beginning of the buffer.
- *
- * GetBufferSize -
- *    Returns the size of the buffer, in bytes.
- **************************************************************************/
-
-typedef interface ID3DXBuffer ID3DXBuffer;
-typedef interface ID3DXBuffer *LPD3DXBUFFER;
-
-/* {8BA5FB08-5195-40e2-AC58-0D989C3A0102} */
-DEFINE_GUID(IID_ID3DXBuffer,
-0x8ba5fb08, 0x5195, 0x40e2, 0xac, 0x58, 0xd, 0x98, 0x9c, 0x3a, 0x1, 0x2);
-
-#undef INTERFACE
-#define INTERFACE ID3DXBuffer
-
-typedef interface ID3DXBuffer {
-    const struct ID3DXBufferVtbl FAR* lpVtbl;
-} ID3DXBuffer;
-typedef const struct ID3DXBufferVtbl ID3DXBufferVtbl;
-const struct ID3DXBufferVtbl
-{
-    /* IUnknown */
-    STDMETHOD(QueryInterface)(THIS_ REFIID iid, LPVOID *ppv) PURE;
-    STDMETHOD_(ULONG, AddRef)(THIS) PURE;
-    STDMETHOD_(ULONG, Release)(THIS) PURE;
-
-    /* ID3DXBuffer */
-    STDMETHOD_(LPVOID, GetBufferPointer)(THIS) PURE;
-    STDMETHOD_(DWORD, GetBufferSize)(THIS) PURE;
-};
-
-HRESULT WINAPI
-    D3DXAssembleShader(
-        LPCSTR                          pSrcData,
-        UINT                            SrcDataLen,
-        CONST LPVOID*                   pDefines,
-        LPVOID                          pInclude,
-        DWORD                           Flags,
-        LPD3DXBUFFER*                   ppShader,
-        LPD3DXBUFFER*                   ppErrorMsgs);
-
-static void PrintShaderData(LPDWORD shader_data, DWORD shader_size)
-{
-    OutputDebugStringA("const DWORD shader_data[] = {\n\t");
-    {
-        SDL_bool newline = SDL_FALSE;
-        unsigned i;
-        for (i = 0; i < shader_size / sizeof(DWORD); ++i) {
-            char dword[11];
-            if (i > 0) {
-                if ((i%6) == 0) {
-                    newline = SDL_TRUE;
-                }
-                if (newline) {
-                    OutputDebugStringA(",\n    ");
-                    newline = SDL_FALSE;
-                } else {
-                    OutputDebugStringA(", ");
-                }
-            }
-            SDL_snprintf(dword, sizeof(dword), "0x%8.8x", shader_data[i]);
-            OutputDebugStringA(dword);
-        }
-        OutputDebugStringA("\n};\n");
-    }
-}
-
-#endif /* ASSEMBLE_SHADER */
+#include "SDL_shaders_d3d.h"


 /* Direct3D renderer implementation */
@ -188,7 +110,7 @@ typedef struct
    IDirect3DSurface9 *defaultRenderTarget;
    IDirect3DSurface9 *currentRenderTarget;
    void* d3dxDLL;
-    LPDIRECT3DPIXELSHADER9 ps_yuv;
+    LPDIRECT3DPIXELSHADER9 shaders[NUM_SHADERS];
 } D3D_RenderData;

 typedef struct
@ -197,6 +119,7 @@ typedef struct
    int w, h;
    DWORD usage;
    Uint32 format;
+    D3DFORMAT d3dfmt;
    IDirect3DTexture9 *texture;
    IDirect3DTexture9 *staging;
 } D3D_TextureRep;
@ -313,6 +236,8 @@ PixelFormatToD3DFMT(Uint32 format)
        return D3DFMT_A8R8G8B8;
    case SDL_PIXELFORMAT_YV12:
    case SDL_PIXELFORMAT_IYUV:
+    case SDL_PIXELFORMAT_NV12:
+    case SDL_PIXELFORMAT_NV21:
        return D3DFMT_L8;
    default:
        return D3DFMT_UNKNOWN;
@ -661,137 +586,19 @@ D3D_CreateRenderer(SDL_Window * window, Uint32 flags)
    /* Set up parameters for rendering */
    D3D_InitRenderState(data);

-    if (caps.MaxSimultaneousTextures >= 3)
-    {
-#ifdef ASSEMBLE_SHADER
-        /* This shader was created by running the following HLSL through the fxc compiler
-           and then tuning the generated assembly.
-
-           fxc /T fx_4_0 /O3 /Gfa /Fc yuv.fxc yuv.fx
-
-           --- yuv.fx ---
-           Texture2D g_txY;
-           Texture2D g_txU;
-           Texture2D g_txV;
-
-           SamplerState samLinear
-           {
-               Filter = ANISOTROPIC;
-               AddressU = Clamp;
-               AddressV = Clamp;
-               MaxAnisotropy = 1;
-           };
-
-           struct VS_OUTPUT
-           {
-                float2 TextureUV  : TEXCOORD0;
-           };
-
-           struct PS_OUTPUT
-           {
-                float4 RGBAColor : SV_Target;
-           };
-
-           PS_OUTPUT YUV420( VS_OUTPUT In ) 
-           {
-               const float3 offset = {-0.0627451017, -0.501960814, -0.501960814};
-               const float3 Rcoeff = {1.164,  0.000,  1.596};
-               const float3 Gcoeff = {1.164, -0.391, -0.813};
-               const float3 Bcoeff = {1.164,  2.018,  0.000};
-
-               PS_OUTPUT Output;
-               float2 TextureUV = In.TextureUV;
-
-               float3 yuv;
-               yuv.x = g_txY.Sample( samLinear, TextureUV ).r;
-               yuv.y = g_txU.Sample( samLinear, TextureUV ).r;
-               yuv.z = g_txV.Sample( samLinear, TextureUV ).r;
-
-               yuv += offset;
-               Output.RGBAColor.r = dot(yuv, Rcoeff);
-               Output.RGBAColor.g = dot(yuv, Gcoeff);
-               Output.RGBAColor.b = dot(yuv, Bcoeff);
-               Output.RGBAColor.a = 1.0f;
-
-               return Output;
-           }
-
-           technique10 RenderYUV420
-           {
-               pass P0
-               {
-                    SetPixelShader( CompileShader( ps_4_0_level_9_0, YUV420() ) );
-               }
-           }
-        */
-        const char *shader_text =
-            "ps_2_0\n"
-            "def c0, -0.0627451017, -0.501960814, -0.501960814, 1\n"
-            "def c1, 1.16400003, 0, 1.59599996, 0\n"
-            "def c2, 1.16400003, -0.391000003, -0.813000023, 0\n"
-            "def c3, 1.16400003, 2.01799989, 0, 0\n"
-            "dcl t0.xy\n"
-            "dcl v0.xyzw\n"
-            "dcl_2d s0\n"
-            "dcl_2d s1\n"
-            "dcl_2d s2\n"
-            "texld r0, t0, s0\n"
-            "texld r1, t0, s1\n"
-            "texld r2, t0, s2\n"
-            "mov r0.y, r1.x\n"
-            "mov r0.z, r2.x\n"
-            "add r0.xyz, r0, c0\n"
-            "dp3 r1.x, r0, c1\n"
-            "dp3 r1.y, r0, c2\n"
-            "dp2add r1.z, r0, c3, c3.z\n"   /* Logically this is "dp3 r1.z, r0, c3" but the optimizer did its magic */
-            "mov r1.w, c0.w\n"
-            "mul r0, r1, v0\n"              /* Not in the HLSL, multiply by vertex color */
-            "mov oC0, r0\n"
-        ;
-        LPD3DXBUFFER pCode;
-        LPD3DXBUFFER pErrorMsgs;
-        LPDWORD shader_data = NULL;
-        DWORD   shader_size = 0;
-        result = D3DXAssembleShader(shader_text, SDL_strlen(shader_text), NULL, NULL, 0, &pCode, &pErrorMsgs);
-        if (!FAILED(result)) {
-            shader_data = (DWORD*)pCode->lpVtbl->GetBufferPointer(pCode);
-            shader_size = pCode->lpVtbl->GetBufferSize(pCode);
-            PrintShaderData(shader_data, shader_size);
-        } else {
-            const char *error = (const char *)pErrorMsgs->lpVtbl->GetBufferPointer(pErrorMsgs);
-            SDL_SetError("Couldn't assemble shader: %s", error);
-        }
-        if (shader_data != NULL)
-#else
-        const DWORD shader_data[] = {
-            0xffff0200, 0x05000051, 0xa00f0000, 0xbd808081, 0xbf008081, 0xbf008081,
-            0x3f800000, 0x05000051, 0xa00f0001, 0x3f94fdf4, 0x00000000, 0x3fcc49ba,
-            0x00000000, 0x05000051, 0xa00f0002, 0x3f94fdf4, 0xbec83127, 0xbf5020c5,
-            0x00000000, 0x05000051, 0xa00f0003, 0x3f94fdf4, 0x400126e9, 0x00000000,
-            0x00000000, 0x0200001f, 0x80000000, 0xb0030000, 0x0200001f, 0x80000000,
-            0x900f0000, 0x0200001f, 0x90000000, 0xa00f0800, 0x0200001f, 0x90000000,
-            0xa00f0801, 0x0200001f, 0x90000000, 0xa00f0802, 0x03000042, 0x800f0000,
-            0xb0e40000, 0xa0e40800, 0x03000042, 0x800f0001, 0xb0e40000, 0xa0e40801,
-            0x03000042, 0x800f0002, 0xb0e40000, 0xa0e40802, 0x02000001, 0x80020000,
-            0x80000001, 0x02000001, 0x80040000, 0x80000002, 0x03000002, 0x80070000,
-            0x80e40000, 0xa0e40000, 0x03000008, 0x80010001, 0x80e40000, 0xa0e40001,
-            0x03000008, 0x80020001, 0x80e40000, 0xa0e40002, 0x0400005a, 0x80040001,
-            0x80e40000, 0xa0e40003, 0xa0aa0003, 0x02000001, 0x80080001, 0xa0ff0000,
-            0x03000005, 0x800f0000, 0x80e40001, 0x90e40000, 0x02000001, 0x800f0800,
-            0x80e40000, 0x0000ffff
-        };
-#endif
-        {
-            result = IDirect3DDevice9_CreatePixelShader(data->device, shader_data, &data->ps_yuv);
-            if (!FAILED(result)) {
-                renderer->info.texture_formats[renderer->info.num_texture_formats++] = SDL_PIXELFORMAT_YV12;
-                renderer->info.texture_formats[renderer->info.num_texture_formats++] = SDL_PIXELFORMAT_IYUV;
-            } else {
+    if (caps.MaxSimultaneousTextures >= 3) {
+        int i;
+        for (i = 0; i < SDL_arraysize(data->shaders); ++i) {
+            result = D3D9_CreatePixelShader(data->device, (D3D9_Shader)i, &data->shaders[i]);
+            if (FAILED(result)) {
                D3D_SetError("CreatePixelShader()", result);
            }
        }
+        if (data->shaders[SHADER_YUV_JPEG] && data->shaders[SHADER_YUV_BT601] && data->shaders[SHADER_YUV_BT709]) {
+            renderer->info.texture_formats[renderer->info.num_texture_formats++] = SDL_PIXELFORMAT_YV12;
+            renderer->info.texture_formats[renderer->info.num_texture_formats++] = SDL_PIXELFORMAT_IYUV;
+        }
    }
-
    return renderer;
 }

@ -870,7 +677,7 @@ GetScaleQuality(void)
 }

 static int
-D3D_CreateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, DWORD usage, Uint32 format, int w, int h)
+D3D_CreateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, DWORD usage, Uint32 format, D3DFORMAT d3dfmt, int w, int h)
 {
    HRESULT result;

@ -879,6 +686,7 @@ D3D_CreateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, DWORD us
    texture->h = h;
    texture->usage = usage;
    texture->format = format;
+    texture->d3dfmt = d3dfmt;

    result = IDirect3DDevice9_CreateTexture(device, w, h, 1, usage,
        PixelFormatToD3DFMT(format),
@ -897,8 +705,7 @@ D3D_CreateStagingTexture(IDirect3DDevice9 *device, D3D_TextureRep *texture)

    if (texture->staging == NULL) {
        result = IDirect3DDevice9_CreateTexture(device, texture->w, texture->h, 1, 0,
-            PixelFormatToD3DFMT(texture->format),
-            D3DPOOL_SYSTEMMEM, &texture->staging, NULL);
+            texture->d3dfmt, D3DPOOL_SYSTEMMEM, &texture->staging, NULL);
        if (FAILED(result)) {
            return D3D_SetError("CreateTexture(D3DPOOL_SYSTEMMEM)", result);
        }
@ -934,7 +741,7 @@ D3D_BindTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, DWORD samp
 }

 static int
-D3D_RecreateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, Uint32 format, int w, int h)
+D3D_RecreateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture)
 {
    if (texture->texture) {
        IDirect3DTexture9_Release(texture->texture);
@ -948,7 +755,7 @@ D3D_RecreateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, Uint32
 }

 static int
-D3D_UpdateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, Uint32 format, int x, int y, int w, int h, const void *pixels, int pitch)
+D3D_UpdateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, int x, int y, int w, int h, const void *pixels, int pitch)
 {
    RECT d3drect;
    D3DLOCKED_RECT locked;
@ -972,8 +779,8 @@ D3D_UpdateTextureRep(IDirect3DDevice9 *device, D3D_TextureRep *texture, Uint32 f
    }

    src = (const Uint8 *)pixels;
-    dst = locked.pBits;
-    length = w * SDL_BYTESPERPIXEL(format);
+    dst = (Uint8 *)locked.pBits;
+    length = w * SDL_BYTESPERPIXEL(texture->format);
    if (length == pitch && length == locked.Pitch) {
        SDL_memcpy(dst, src, length*h);
    } else {
@ -1032,7 +839,7 @@ D3D_CreateTexture(SDL_Renderer * renderer, SDL_Texture * texture)
        usage = 0;
    }

-    if (D3D_CreateTextureRep(data->device, &texturedata->texture, usage, texture->format, texture->w, texture->h) < 0) {
+    if (D3D_CreateTextureRep(data->device, &texturedata->texture, usage, texture->format, PixelFormatToD3DFMT(texture->format), texture->w, texture->h) < 0) {
        return -1;
    }

@ -1040,11 +847,11 @@ D3D_CreateTexture(SDL_Renderer * renderer, SDL_Texture * texture)
        texture->format == SDL_PIXELFORMAT_IYUV) {
        texturedata->yuv = SDL_TRUE;

-        if (D3D_CreateTextureRep(data->device, &texturedata->utexture, usage, texture->format, texture->w / 2, texture->h / 2) < 0) {
+        if (D3D_CreateTextureRep(data->device, &texturedata->utexture, usage, texture->format, PixelFormatToD3DFMT(texture->format), (texture->w + 1) / 2, (texture->h + 1) / 2) < 0) {
            return -1;
        }

-        if (D3D_CreateTextureRep(data->device, &texturedata->vtexture, usage, texture->format, texture->w / 2, texture->h / 2) < 0) {
+        if (D3D_CreateTextureRep(data->device, &texturedata->vtexture, usage, texture->format, PixelFormatToD3DFMT(texture->format), (texture->w + 1) / 2, (texture->h + 1) / 2) < 0) {
            return -1;
        }
    }
@ -1061,16 +868,16 @@ D3D_RecreateTexture(SDL_Renderer * renderer, SDL_Texture * texture)
        return 0;
    }

-    if (D3D_RecreateTextureRep(data->device, &texturedata->texture, texture->format, texture->w, texture->h) < 0) {
+    if (D3D_RecreateTextureRep(data->device, &texturedata->texture) < 0) {
        return -1;
    }

    if (texturedata->yuv) {
-        if (D3D_RecreateTextureRep(data->device, &texturedata->utexture, texture->format, texture->w / 2, texture->h / 2) < 0) {
+        if (D3D_RecreateTextureRep(data->device, &texturedata->utexture) < 0) {
            return -1;
        }

-        if (D3D_RecreateTextureRep(data->device, &texturedata->vtexture, texture->format, texture->w / 2, texture->h / 2) < 0) {
+        if (D3D_RecreateTextureRep(data->device, &texturedata->vtexture) < 0) {
            return -1;
        }
    }
@ -1089,7 +896,7 @@ D3D_UpdateTexture(SDL_Renderer * renderer, SDL_Texture * texture,
        return -1;
    }

-    if (D3D_UpdateTextureRep(data->device, &texturedata->texture, texture->format, rect->x, rect->y, rect->w, rect->h, pixels, pitch) < 0) {
+    if (D3D_UpdateTextureRep(data->device, &texturedata->texture, rect->x, rect->y, rect->w, rect->h, pixels, pitch) < 0) {
        return -1;
    }

@ -1097,13 +904,13 @@ D3D_UpdateTexture(SDL_Renderer * renderer, SDL_Texture * texture,
        /* Skip to the correct offset into the next texture */
        pixels = (const void*)((const Uint8*)pixels + rect->h * pitch);

-        if (D3D_UpdateTextureRep(data->device, texture->format == SDL_PIXELFORMAT_YV12 ? &texturedata->vtexture : &texturedata->utexture, texture->format, rect->x / 2, rect->y / 2, rect->w / 2, rect->h / 2, pixels, pitch / 2) < 0) {
+        if (D3D_UpdateTextureRep(data->device, texture->format == SDL_PIXELFORMAT_YV12 ? &texturedata->vtexture : &texturedata->utexture, rect->x / 2, rect->y / 2, (rect->w + 1) / 2, (rect->h + 1) / 2, pixels, (pitch + 1) / 2) < 0) {
            return -1;
        }

        /* Skip to the correct offset into the next texture */
-        pixels = (const void*)((const Uint8*)pixels + (rect->h * pitch)/4);
-        if (D3D_UpdateTextureRep(data->device, texture->format == SDL_PIXELFORMAT_YV12 ? &texturedata->utexture : &texturedata->vtexture, texture->format, rect->x / 2, rect->y / 2, rect->w / 2, rect->h / 2, pixels, pitch / 2) < 0) {
+        pixels = (const void*)((const Uint8*)pixels + ((rect->h + 1) / 2) * ((pitch + 1) / 2));
+        if (D3D_UpdateTextureRep(data->device, texture->format == SDL_PIXELFORMAT_YV12 ? &texturedata->utexture : &texturedata->vtexture, rect->x / 2, (rect->y + 1) / 2, (rect->w + 1) / 2, (rect->h + 1) / 2, pixels, (pitch + 1) / 2) < 0) {
            return -1;
        }
    }
@ -1125,13 +932,13 @@ D3D_UpdateTextureYUV(SDL_Renderer * renderer, SDL_Texture * texture,
        return -1;
    }

-    if (D3D_UpdateTextureRep(data->device, &texturedata->texture, texture->format, rect->x, rect->y, rect->w, rect->h, Yplane, Ypitch) < 0) {
+    if (D3D_UpdateTextureRep(data->device, &texturedata->texture, rect->x, rect->y, rect->w, rect->h, Yplane, Ypitch) < 0) {
        return -1;
    }
-    if (D3D_UpdateTextureRep(data->device, &texturedata->utexture, texture->format, rect->x / 2, rect->y / 2, rect->w / 2, rect->h / 2, Uplane, Upitch) < 0) {
+    if (D3D_UpdateTextureRep(data->device, &texturedata->utexture, rect->x / 2, rect->y / 2, (rect->w + 1) / 2, (rect->h + 1) / 2, Uplane, Upitch) < 0) {
        return -1;
    }
-    if (D3D_UpdateTextureRep(data->device, &texturedata->vtexture, texture->format, rect->x / 2, rect->y / 2, rect->w / 2, rect->h / 2, Vplane, Vpitch) < 0) {
+    if (D3D_UpdateTextureRep(data->device, &texturedata->vtexture, rect->x / 2, rect->y / 2, (rect->w + 1) / 2, (rect->h + 1) / 2, Vplane, Vpitch) < 0) {
        return -1;
    }
    return 0;
@ -1609,13 +1416,60 @@ D3D_UpdateTextureScaleMode(D3D_RenderData *data, D3D_TextureData *texturedata, u
    }
 }

+/*
+ * Bind a texture for drawing and report which pixel shader (if any) the
+ * caller must activate before issuing the draw call.
+ *
+ * The main texture is always bound at index 0.  For YUV textures the U and V
+ * planes are additionally bound at indices 1 and 2, and *shader is set to the
+ * conversion shader chosen by SDL_GetYUVConversionModeForResolution() for the
+ * texture's dimensions.  For every other texture *shader is left NULL.
+ *
+ * Returns 0 on success.  Returns a negative value if the texture has no
+ * driver data, if the conversion mode is not JPEG, BT601 or BT709, or if
+ * binding any of the textures fails.
+ */
+static int
+D3D_RenderSetupTextureState(SDL_Renderer * renderer, SDL_Texture * texture, LPDIRECT3DPIXELSHADER9 *shader)
+{
+    D3D_RenderData *data = (D3D_RenderData *) renderer->driverdata;
+    D3D_TextureData *texturedata;
+
+    /* Callers may pass an uninitialized variable, so default to "no shader" up front. */
+    *shader = NULL;
+
+    texturedata = (D3D_TextureData *)texture->driverdata;
+    if (!texturedata) {
+        SDL_SetError("Texture is not currently available");
+        return -1;
+    }
+
+    D3D_UpdateTextureScaleMode(data, texturedata, 0);
+
+    if (D3D_BindTextureRep(data->device, &texturedata->texture, 0) < 0) {
+        return -1;
+    }
+
+    if (texturedata->yuv) {
+        /* Pick the shader matching the active YUV conversion mode for this texture size. */
+        switch (SDL_GetYUVConversionModeForResolution(texture->w, texture->h)) {
+        case SDL_YUV_CONVERSION_JPEG:
+            *shader = data->shaders[SHADER_YUV_JPEG];
+            break;
+        case SDL_YUV_CONVERSION_BT601:
+            *shader = data->shaders[SHADER_YUV_BT601];
+            break;
+        case SDL_YUV_CONVERSION_BT709:
+            *shader = data->shaders[SHADER_YUV_BT709];
+            break;
+        default:
+            return SDL_SetError("Unsupported YUV conversion mode");
+        }
+
+        /* The U and V planes use indices 1 and 2, next to the main texture at index 0. */
+        D3D_UpdateTextureScaleMode(data, texturedata, 1);
+        D3D_UpdateTextureScaleMode(data, texturedata, 2);
+
+        if (D3D_BindTextureRep(data->device, &texturedata->utexture, 1) < 0) {
+            return -1;
+        }
+        if (D3D_BindTextureRep(data->device, &texturedata->vtexture, 2) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
 static int
 D3D_RenderCopy(SDL_Renderer * renderer, SDL_Texture * texture,
               const SDL_Rect * srcrect, const SDL_FRect * dstrect)
 {
    D3D_RenderData *data = (D3D_RenderData *) renderer->driverdata;
-    D3D_TextureData *texturedata;
-    LPDIRECT3DPIXELSHADER9 shader = NULL;
+    LPDIRECT3DPIXELSHADER9 shader;
    float minx, miny, maxx, maxy;
    float minu, maxu, minv, maxv;
    DWORD color;
@ -1626,12 +1480,6 @@ D3D_RenderCopy(SDL_Renderer * renderer, SDL_Texture * texture,
        return -1;
    }

-    texturedata = (D3D_TextureData *)texture->driverdata;
-    if (!texturedata) {
-        SDL_SetError("Texture is not currently available");
-        return -1;
-    }
-
    minx = dstrect->x - 0.5f;
    miny = dstrect->y - 0.5f;
    maxx = dstrect->x + dstrect->w - 0.5f;
@ -1674,45 +1522,25 @@ D3D_RenderCopy(SDL_Renderer * renderer, SDL_Texture * texture,

    D3D_SetBlendMode(data, texture->blendMode);

-    D3D_UpdateTextureScaleMode(data, texturedata, 0);
-
-    if (D3D_BindTextureRep(data->device, &texturedata->texture, 0) < 0) {
+    if (D3D_RenderSetupTextureState(renderer, texture, &shader) < 0) {
        return -1;
    }
-
-    if (texturedata->yuv) {
-        shader = data->ps_yuv;
-
-        D3D_UpdateTextureScaleMode(data, texturedata, 1);
-        D3D_UpdateTextureScaleMode(data, texturedata, 2);
-
-        if (D3D_BindTextureRep(data->device, &texturedata->utexture, 1) < 0) {
-            return -1;
-        }
-        if (D3D_BindTextureRep(data->device, &texturedata->vtexture, 2) < 0) {
-            return -1;
-        }
-    }
-
+    
    if (shader) {
        result = IDirect3DDevice9_SetPixelShader(data->device, shader);
        if (FAILED(result)) {
            return D3D_SetError("SetShader()", result);
        }
    }
-    result =
-        IDirect3DDevice9_DrawPrimitiveUP(data->device, D3DPT_TRIANGLEFAN, 2,
-                                         vertices, sizeof(*vertices));
+    result = IDirect3DDevice9_DrawPrimitiveUP(data->device, D3DPT_TRIANGLEFAN, 2,
+                                              vertices, sizeof(*vertices));
    if (FAILED(result)) {
-        return D3D_SetError("DrawPrimitiveUP()", result);
+        D3D_SetError("DrawPrimitiveUP()", result);
    }
    if (shader) {
-        result = IDirect3DDevice9_SetPixelShader(data->device, NULL);
-        if (FAILED(result)) {
-            return D3D_SetError("SetShader()", result);
-        }
+        IDirect3DDevice9_SetPixelShader(data->device, NULL);
    }
-    return 0;
+    return FAILED(result) ? -1 : 0;
 }


@ -1722,7 +1550,6 @@ D3D_RenderCopyEx(SDL_Renderer * renderer, SDL_Texture * texture,
               const double angle, const SDL_FPoint * center, const SDL_RendererFlip flip)
 {
    D3D_RenderData *data = (D3D_RenderData *) renderer->driverdata;
-    D3D_TextureData *texturedata;
    LPDIRECT3DPIXELSHADER9 shader = NULL;
    float minx, miny, maxx, maxy;
    float minu, maxu, minv, maxv;
@ -1736,12 +1563,6 @@ D3D_RenderCopyEx(SDL_Renderer * renderer, SDL_Texture * texture,
        return -1;
    }

-    texturedata = (D3D_TextureData *)texture->driverdata;
-    if (!texturedata) {
-        SDL_SetError("Texture is not currently available");
-        return -1;
-    }
-
    centerx = center->x;
    centery = center->y;

@ -1798,54 +1619,37 @@ D3D_RenderCopyEx(SDL_Renderer * renderer, SDL_Texture * texture,

    D3D_SetBlendMode(data, texture->blendMode);

+    if (D3D_RenderSetupTextureState(renderer, texture, &shader) < 0) {
+        return -1;
+    }
+
    /* Rotate and translate */
    modelMatrix = MatrixMultiply(
            MatrixRotationZ((float)(M_PI * (float) angle / 180.0f)),
            MatrixTranslation(dstrect->x + center->x - 0.5f, dstrect->y + center->y - 0.5f, 0));
    IDirect3DDevice9_SetTransform(data->device, D3DTS_VIEW, (D3DMATRIX*)&modelMatrix);
-
-    D3D_UpdateTextureScaleMode(data, texturedata, 0);
-
-    if (D3D_BindTextureRep(data->device, &texturedata->texture, 0) < 0) {
-        return -1;
-    }
-
-    if (texturedata->yuv) {
-        shader = data->ps_yuv;
-
-        D3D_UpdateTextureScaleMode(data, texturedata, 1);
-        D3D_UpdateTextureScaleMode(data, texturedata, 2);
-        
-        if (D3D_BindTextureRep(data->device, &texturedata->utexture, 1) < 0) {
-            return -1;
-        }
-        if (D3D_BindTextureRep(data->device, &texturedata->vtexture, 2) < 0) {
-            return -1;
-        }
-    }
-
+    
    if (shader) {
        result = IDirect3DDevice9_SetPixelShader(data->device, shader);
        if (FAILED(result)) {
-            return D3D_SetError("SetShader()", result);
+            D3D_SetError("SetShader()", result);
+            goto done;
        }
    }
-    result =
-        IDirect3DDevice9_DrawPrimitiveUP(data->device, D3DPT_TRIANGLEFAN, 2,
-                                         vertices, sizeof(*vertices));
+    result = IDirect3DDevice9_DrawPrimitiveUP(data->device, D3DPT_TRIANGLEFAN, 2,
+                                              vertices, sizeof(*vertices));
    if (FAILED(result)) {
-        return D3D_SetError("DrawPrimitiveUP()", result);
+        D3D_SetError("DrawPrimitiveUP()", result);
    }
+done:
    if (shader) {
-        result = IDirect3DDevice9_SetPixelShader(data->device, NULL);
-        if (FAILED(result)) {
-            return D3D_SetError("SetShader()", result);
-        }
+        IDirect3DDevice9_SetPixelShader(data->device, NULL);
    }

    modelMatrix = MatrixIdentity();
    IDirect3DDevice9_SetTransform(data->device, D3DTS_VIEW, (D3DMATRIX*)&modelMatrix);
-    return 0;
+
+    return FAILED(result) ? -1 : 0;
 }

 static int
@ -1955,6 +1759,8 @@ D3D_DestroyRenderer(SDL_Renderer * renderer)
    D3D_RenderData *data = (D3D_RenderData *) renderer->driverdata;

    if (data) {
+        int i;
+
        /* Release the render target */
        if (data->defaultRenderTarget) {
            IDirect3DSurface9_Release(data->defaultRenderTarget);
@ -1964,11 +1770,15 @@ D3D_DestroyRenderer(SDL_Renderer * renderer)
            IDirect3DSurface9_Release(data->currentRenderTarget);
            data->currentRenderTarget = NULL;
        }
-        if (data->ps_yuv) {
-            IDirect3DPixelShader9_Release(data->ps_yuv);
+        for (i = 0; i < SDL_arraysize(data->shaders); ++i) {
+            if (data->shaders[i]) {
+                IDirect3DPixelShader9_Release(data->shaders[i]);
+                data->shaders[i] = NULL;
+            }
        }
        if (data->device) {
            IDirect3DDevice9_Release(data->device);
+            data->device = NULL;
        }
        if (data->d3d) {
            IDirect3D9_Release(data->d3d);
--- a/src/render/direct3d/SDL_shaders_d3d.c
+++ b/src/render/direct3d/SDL_shaders_d3d.c
@ -0,0 +1,274 @@
+/*
+  Simple DirectMedia Layer
+  Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+#include "../../SDL_internal.h"
+
+#include "SDL_render.h"
+#include "SDL_system.h"
+
+#if SDL_VIDEO_RENDER_D3D && !SDL_RENDER_DISABLED
+
+#include "../../core/windows/SDL_windows.h"
+
+#include <d3d9.h>
+
+#include "SDL_shaders_d3d.h"
+
+/* The shaders here were compiled with:
+
+       fxc /T ps_2_0 /Fo"<OUTPUT FILE>" "<INPUT FILE>"
+
+   Shader object code was converted to a list of DWORDs via the following
+   *nix style command (available separately from Windows + MSVC):
+
+     hexdump -v -e '6/4 "0x%08.8x, " "\n"' <FILE>
+*/
+
+/* --- D3D9_PixelShader_YUV_JPEG.hlsl ---
+    Texture2D theTextureY : register(t0);
+    Texture2D theTextureU : register(t1);
+    Texture2D theTextureV : register(t2);
+    SamplerState theSampler = sampler_state
+    {
+        addressU = Clamp;
+        addressV = Clamp;
+        mipfilter = NONE;
+        minfilter = LINEAR;
+        magfilter = LINEAR;
+    };
+
+    struct PixelShaderInput
+    {
+        float4 pos : SV_POSITION;
+        float2 tex : TEXCOORD0;
+        float4 color : COLOR0;
+    };
+
+    float4 main(PixelShaderInput input) : SV_TARGET
+    {
+        const float3 offset = {0.0, -0.501960814, -0.501960814};
+        const float3 Rcoeff = {1.0000,  0.0000,  1.4020};
+        const float3 Gcoeff = {1.0000, -0.3441, -0.7141};
+        const float3 Bcoeff = {1.0000,  1.7720,  0.0000};
+
+        float4 Output;
+
+        float3 yuv;
+        yuv.x = theTextureY.Sample(theSampler, input.tex).r;
+        yuv.y = theTextureU.Sample(theSampler, input.tex).r;
+        yuv.z = theTextureV.Sample(theSampler, input.tex).r;
+
+        yuv += offset;
+        Output.r = dot(yuv, Rcoeff);
+        Output.g = dot(yuv, Gcoeff);
+        Output.b = dot(yuv, Bcoeff);
+        Output.a = 1.0f;
+
+        return Output * input.color;
+    }
+*/
+static const DWORD D3D9_PixelShader_YUV_JPEG[] = {
+    0xffff0200, 0x0044fffe, 0x42415443, 0x0000001c, 0x000000d7, 0xffff0200,
+    0x00000003, 0x0000001c, 0x00000100, 0x000000d0, 0x00000058, 0x00010003,
+    0x00000001, 0x00000070, 0x00000000, 0x00000080, 0x00020003, 0x00000001,
+    0x00000098, 0x00000000, 0x000000a8, 0x00000003, 0x00000001, 0x000000c0,
+    0x00000000, 0x53656874, 0x6c706d61, 0x742b7265, 0x65546568, 0x72757478,
+    0xab005565, 0x00070004, 0x00040001, 0x00000001, 0x00000000, 0x53656874,
+    0x6c706d61, 0x742b7265, 0x65546568, 0x72757478, 0xab005665, 0x00070004,
+    0x00040001, 0x00000001, 0x00000000, 0x53656874, 0x6c706d61, 0x742b7265,
+    0x65546568, 0x72757478, 0xab005965, 0x00070004, 0x00040001, 0x00000001,
+    0x00000000, 0x325f7370, 0x4d00305f, 0x6f726369, 0x74666f73, 0x29522820,
+    0x534c4820, 0x6853204c, 0x72656461, 0x6d6f4320, 0x656c6970, 0x2e362072,
+    0x36392e33, 0x312e3030, 0x34383336, 0xababab00, 0x05000051, 0xa00f0000,
+    0x00000000, 0xbf008081, 0xbf008081, 0x3f800000, 0x05000051, 0xa00f0001,
+    0x3f800000, 0x00000000, 0x3fb374bc, 0x00000000, 0x05000051, 0xa00f0002,
+    0x3f800000, 0xbeb02de0, 0xbf36cf42, 0x00000000, 0x05000051, 0xa00f0003,
+    0x3f800000, 0x3fe2d0e5, 0x00000000, 0x00000000, 0x0200001f, 0x80000000,
+    0xb0030000, 0x0200001f, 0x80000000, 0x900f0000, 0x0200001f, 0x90000000,
+    0xa00f0800, 0x0200001f, 0x90000000, 0xa00f0801, 0x0200001f, 0x90000000,
+    0xa00f0802, 0x03000042, 0x800f0000, 0xb0e40000, 0xa0e40800, 0x03000042,
+    0x800f0001, 0xb0e40000, 0xa0e40801, 0x03000042, 0x800f0002, 0xb0e40000,
+    0xa0e40802, 0x02000001, 0x80020000, 0x80000001, 0x02000001, 0x80040000,
+    0x80000002, 0x03000002, 0x80070000, 0x80e40000, 0xa0e40000, 0x03000008,
+    0x80010001, 0x80e40000, 0xa0e40001, 0x03000008, 0x80020001, 0x80e40000,
+    0xa0e40002, 0x0400005a, 0x80040001, 0x80e40000, 0xa0e40003, 0xa0aa0003,
+    0x02000001, 0x80080001, 0xa0ff0000, 0x03000005, 0x800f0000, 0x80e40001,
+    0x90e40000, 0x02000001, 0x800f0800, 0x80e40000, 0x0000ffff
+};
+
+/* --- D3D9_PixelShader_YUV_BT601.hlsl ---
+    Texture2D theTextureY : register(t0);
+    Texture2D theTextureU : register(t1);
+    Texture2D theTextureV : register(t2);
+    SamplerState theSampler = sampler_state
+    {
+        addressU = Clamp;
+        addressV = Clamp;
+        mipfilter = NONE;
+        minfilter = LINEAR;
+        magfilter = LINEAR;
+    };
+
+    struct PixelShaderInput
+    {
+        float4 pos : SV_POSITION;
+        float2 tex : TEXCOORD0;
+        float4 color : COLOR0;
+    };
+
+    float4 main(PixelShaderInput input) : SV_TARGET
+    {
+        const float3 offset = {-0.0627451017, -0.501960814, -0.501960814};
+        const float3 Rcoeff = {1.1644,  0.0000,  1.5960};
+        const float3 Gcoeff = {1.1644, -0.3918, -0.8130};
+        const float3 Bcoeff = {1.1644,  2.0172,  0.0000};
+
+        float4 Output;
+
+        float3 yuv;
+        yuv.x = theTextureY.Sample(theSampler, input.tex).r;
+        yuv.y = theTextureU.Sample(theSampler, input.tex).r;
+        yuv.z = theTextureV.Sample(theSampler, input.tex).r;
+
+        yuv += offset;
+        Output.r = dot(yuv, Rcoeff);
+        Output.g = dot(yuv, Gcoeff);
+        Output.b = dot(yuv, Bcoeff);
+        Output.a = 1.0f;
+
+        return Output * input.color;
+    }
+*/
+static const DWORD D3D9_PixelShader_YUV_BT601[] = {
+    0xffff0200, 0x0044fffe, 0x42415443, 0x0000001c, 0x000000d7, 0xffff0200,
+    0x00000003, 0x0000001c, 0x00000100, 0x000000d0, 0x00000058, 0x00010003,
+    0x00000001, 0x00000070, 0x00000000, 0x00000080, 0x00020003, 0x00000001,
+    0x00000098, 0x00000000, 0x000000a8, 0x00000003, 0x00000001, 0x000000c0,
+    0x00000000, 0x53656874, 0x6c706d61, 0x742b7265, 0x65546568, 0x72757478,
+    0xab005565, 0x00070004, 0x00040001, 0x00000001, 0x00000000, 0x53656874,
+    0x6c706d61, 0x742b7265, 0x65546568, 0x72757478, 0xab005665, 0x00070004,
+    0x00040001, 0x00000001, 0x00000000, 0x53656874, 0x6c706d61, 0x742b7265,
+    0x65546568, 0x72757478, 0xab005965, 0x00070004, 0x00040001, 0x00000001,
+    0x00000000, 0x325f7370, 0x4d00305f, 0x6f726369, 0x74666f73, 0x29522820,
+    0x534c4820, 0x6853204c, 0x72656461, 0x6d6f4320, 0x656c6970, 0x2e362072,
+    0x36392e33, 0x312e3030, 0x34383336, 0xababab00, 0x05000051, 0xa00f0000,
+    0xbd808081, 0xbf008081, 0xbf008081, 0x3f800000, 0x05000051, 0xa00f0001,
+    0x3f950b0f, 0x00000000, 0x3fcc49ba, 0x00000000, 0x05000051, 0xa00f0002,
+    0x3f950b0f, 0xbec89a02, 0xbf5020c5, 0x00000000, 0x05000051, 0xa00f0003,
+    0x3f950b0f, 0x400119ce, 0x00000000, 0x00000000, 0x0200001f, 0x80000000,
+    0xb0030000, 0x0200001f, 0x80000000, 0x900f0000, 0x0200001f, 0x90000000,
+    0xa00f0800, 0x0200001f, 0x90000000, 0xa00f0801, 0x0200001f, 0x90000000,
+    0xa00f0802, 0x03000042, 0x800f0000, 0xb0e40000, 0xa0e40800, 0x03000042,
+    0x800f0001, 0xb0e40000, 0xa0e40801, 0x03000042, 0x800f0002, 0xb0e40000,
+    0xa0e40802, 0x02000001, 0x80020000, 0x80000001, 0x02000001, 0x80040000,
+    0x80000002, 0x03000002, 0x80070000, 0x80e40000, 0xa0e40000, 0x03000008,
+    0x80010001, 0x80e40000, 0xa0e40001, 0x03000008, 0x80020001, 0x80e40000,
+    0xa0e40002, 0x0400005a, 0x80040001, 0x80e40000, 0xa0e40003, 0xa0aa0003,
+    0x02000001, 0x80080001, 0xa0ff0000, 0x03000005, 0x800f0000, 0x80e40001,
+    0x90e40000, 0x02000001, 0x800f0800, 0x80e40000, 0x0000ffff
+};
+
+/* --- D3D9_PixelShader_YUV_BT709.hlsl ---
+    Texture2D theTextureY : register(t0);
+    Texture2D theTextureU : register(t1);
+    Texture2D theTextureV : register(t2);
+    SamplerState theSampler = sampler_state
+    {
+        addressU = Clamp;
+        addressV = Clamp;
+        mipfilter = NONE;
+        minfilter = LINEAR;
+        magfilter = LINEAR;
+    };
+
+    struct PixelShaderInput
+    {
+        float4 pos : SV_POSITION;
+        float2 tex : TEXCOORD0;
+        float4 color : COLOR0;
+    };
+
+    float4 main(PixelShaderInput input) : SV_TARGET
+    {
+        const float3 offset = {-0.0627451017, -0.501960814, -0.501960814};
+        const float3 Rcoeff = {1.1644,  0.0000,  1.7927};
+        const float3 Gcoeff = {1.1644, -0.2132, -0.5329};
+        const float3 Bcoeff = {1.1644,  2.1124,  0.0000};
+
+        float4 Output;
+
+        float3 yuv;
+        yuv.x = theTextureY.Sample(theSampler, input.tex).r;
+        yuv.y = theTextureU.Sample(theSampler, input.tex).r;
+        yuv.z = theTextureV.Sample(theSampler, input.tex).r;
+
+        yuv += offset;
+        Output.r = dot(yuv, Rcoeff);
+        Output.g = dot(yuv, Gcoeff);
+        Output.b = dot(yuv, Bcoeff);
+        Output.a = 1.0f;
+
+        return Output * input.color;
+    }
+*/
+static const DWORD D3D9_PixelShader_YUV_BT709[] = {
+    0xffff0200, 0x0044fffe, 0x42415443, 0x0000001c, 0x000000d7, 0xffff0200,
+    0x00000003, 0x0000001c, 0x00000100, 0x000000d0, 0x00000058, 0x00010003,
+    0x00000001, 0x00000070, 0x00000000, 0x00000080, 0x00020003, 0x00000001,
+    0x00000098, 0x00000000, 0x000000a8, 0x00000003, 0x00000001, 0x000000c0,
+    0x00000000, 0x53656874, 0x6c706d61, 0x742b7265, 0x65546568, 0x72757478,
+    0xab005565, 0x00070004, 0x00040001, 0x00000001, 0x00000000, 0x53656874,
+    0x6c706d61, 0x742b7265, 0x65546568, 0x72757478, 0xab005665, 0x00070004,
+    0x00040001, 0x00000001, 0x00000000, 0x53656874, 0x6c706d61, 0x742b7265,
+    0x65546568, 0x72757478, 0xab005965, 0x00070004, 0x00040001, 0x00000001,
+    0x00000000, 0x325f7370, 0x4d00305f, 0x6f726369, 0x74666f73, 0x29522820,
+    0x534c4820, 0x6853204c, 0x72656461, 0x6d6f4320, 0x656c6970, 0x2e362072,
+    0x36392e33, 0x312e3030, 0x34383336, 0xababab00, 0x05000051, 0xa00f0000,
+    0xbd808081, 0xbf008081, 0xbf008081, 0x3f800000, 0x05000051, 0xa00f0001,
+    0x3f950b0f, 0x00000000, 0x3fe57732, 0x00000000, 0x05000051, 0xa00f0002,
+    0x3f950b0f, 0xbe5a511a, 0xbf086c22, 0x00000000, 0x05000051, 0xa00f0003,
+    0x3f950b0f, 0x40073190, 0x00000000, 0x00000000, 0x0200001f, 0x80000000,
+    0xb0030000, 0x0200001f, 0x80000000, 0x900f0000, 0x0200001f, 0x90000000,
+    0xa00f0800, 0x0200001f, 0x90000000, 0xa00f0801, 0x0200001f, 0x90000000,
+    0xa00f0802, 0x03000042, 0x800f0000, 0xb0e40000, 0xa0e40800, 0x03000042,
+    0x800f0001, 0xb0e40000, 0xa0e40801, 0x03000042, 0x800f0002, 0xb0e40000,
+    0xa0e40802, 0x02000001, 0x80020000, 0x80000001, 0x02000001, 0x80040000,
+    0x80000002, 0x03000002, 0x80070000, 0x80e40000, 0xa0e40000, 0x03000008,
+    0x80010001, 0x80e40000, 0xa0e40001, 0x03000008, 0x80020001, 0x80e40000,
+    0xa0e40002, 0x0400005a, 0x80040001, 0x80e40000, 0xa0e40003, 0xa0aa0003,
+    0x02000001, 0x80080001, 0xa0ff0000, 0x03000005, 0x800f0000, 0x80e40001,
+    0x90e40000, 0x02000001, 0x800f0800, 0x80e40000, 0x0000ffff
+};
+
+
+/* Compiled shader bytecode, indexed by D3D9_Shader.  Entries must stay in the
+   same order as the enumerators of that enum. */
+static const DWORD *D3D9_shaders[] = {
+    D3D9_PixelShader_YUV_JPEG,
+    D3D9_PixelShader_YUV_BT601,
+    D3D9_PixelShader_YUV_BT709,
+};
+
+/*
+ * Create one of the built-in pixel shaders on the given device.
+ *
+ * d3dDevice:   device that will own the shader.
+ * shader:      which built-in shader to create; must be less than NUM_SHADERS.
+ * pixelShader: receives the created shader object on success.
+ *
+ * Returns the HRESULT from IDirect3DDevice9_CreatePixelShader(), or
+ * D3DERR_INVALIDCALL if `shader` does not name an entry in the bytecode table.
+ */
+HRESULT D3D9_CreatePixelShader(IDirect3DDevice9 *d3dDevice, D3D9_Shader shader, IDirect3DPixelShader9 **pixelShader)
+{
+    /* Callers cast plain integers to D3D9_Shader, so reject anything outside
+       the table (including NUM_SHADERS and negative values) rather than
+       reading past the end of D3D9_shaders. */
+    if ((size_t)shader >= SDL_arraysize(D3D9_shaders)) {
+        return D3DERR_INVALIDCALL;
+    }
+    return IDirect3DDevice9_CreatePixelShader(d3dDevice, D3D9_shaders[shader], pixelShader);
+}
+
+#endif /* SDL_VIDEO_RENDER_D3D && !SDL_RENDER_DISABLED */
+
+/* vi: set ts=4 sw=4 expandtab: */
--- a/src/render/direct3d/SDL_shaders_d3d.h
+++ b/src/render/direct3d/SDL_shaders_d3d.h
@ -18,11 +18,17 @@
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
 */
-#include "../SDL_internal.h"
+#include "../../SDL_internal.h"

-/* FIXME: This breaks on various versions of GCC and should be rewritten using intrinsics */
-#if 0 /* (__GNUC__ > 2) && defined(__i386__) && __OPTIMIZE__ && SDL_ASSEMBLY_ROUTINES && !defined(__clang__) */
-#define USE_MMX_ASSEMBLY 1
-#endif
+/* D3D9 shader implementation */
+
+/* Identifies a built-in pixel shader; passed to D3D9_CreatePixelShader(). */
+typedef enum {
+    SHADER_YUV_JPEG,
+    SHADER_YUV_BT601,
+    SHADER_YUV_BT709,
+    NUM_SHADERS     /* number of shaders above; not itself a shader */
+} D3D9_Shader;
+
+/* Creates the built-in pixel shader selected by `shader` on d3dDevice, stores
+   it in *pixelShader, and returns the resulting Direct3D HRESULT. */
+extern HRESULT D3D9_CreatePixelShader(IDirect3DDevice9 *d3dDevice, D3D9_Shader shader, IDirect3DPixelShader9 **pixelShader);

 /* vi: set ts=4 sw=4 expandtab: */
--- a/src/render/direct3d11/SDL_render_d3d11.c
+++ b/src/render/direct3d11/SDL_render_d3d11.c
--- a/src/render/direct3d11/SDL_shaders_d3d11.c
+++ b/src/render/direct3d11/SDL_shaders_d3d11.c
--- a/src/render/direct3d11/SDL_shaders_d3d11.h
+++ b/src/render/direct3d11/SDL_shaders_d3d11.h
@ -0,0 +1,43 @@
+/*
+  Simple DirectMedia Layer
+  Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+#include "../../SDL_internal.h"
+
+/* D3D11 shader implementation */
+
+/* Identifies a pixel shader; this is the selector type taken by
+   D3D11_CreatePixelShader(). */
+typedef enum {
+    SHADER_SOLID,
+    SHADER_RGB,
+    SHADER_YUV_JPEG,
+    SHADER_YUV_BT601,
+    SHADER_YUV_BT709,
+    SHADER_NV12_JPEG,
+    SHADER_NV12_BT601,
+    SHADER_NV12_BT709,
+    SHADER_NV21_JPEG,
+    SHADER_NV21_BT601,
+    SHADER_NV21_BT709,
+    NUM_SHADERS     /* number of shaders above; not itself a shader */
+} D3D11_Shader;
+
+extern int D3D11_CreateVertexShader(ID3D11Device1 *d3dDevice, ID3D11VertexShader **vertexShader, ID3D11InputLayout **inputLayout);
+extern int D3D11_CreatePixelShader(ID3D11Device1 *d3dDevice, D3D11_Shader shader, ID3D11PixelShader **pixelShader);
+
+/* vi: set ts=4 sw=4 expandtab: */
--- a/src/render/mmx.h
+++ b/src/render/mmx.h
@ -1,642 +0,0 @@
-/*	mmx.h
-
-	MultiMedia eXtensions GCC interface library for IA32.
-
-	To use this library, simply include this header file
-	and compile with GCC.  You MUST have inlining enabled
-	in order for mmx_ok() to work; this can be done by
-	simply using -O on the GCC command line.
-
-	Compiling with -DMMX_TRACE will cause detailed trace
-	output to be sent to stderr for each mmx operation.
-	This adds lots of code, and obviously slows execution to
-	a crawl, but can be very useful for debugging.
-
-	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
-	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
-	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
-	AND FITNESS FOR ANY PARTICULAR PURPOSE.
-
-	1997-99 by H. Dietz and R. Fisher
-
- Notes:
-	It appears that the latest gas has the pand problem fixed, therefore
-	  I'll undefine BROKEN_PAND by default.
-*/
-
-#ifndef _MMX_H
-#define _MMX_H
-
-
-/*	Warning:  at this writing, the version of GAS packaged
-	with most Linux distributions does not handle the
-	parallel AND operation mnemonic correctly.  If the
-	symbol BROKEN_PAND is defined, a slower alternative
-	coding will be used.  If execution of mmxtest results
-	in an illegal instruction fault, define this symbol.
-*/
-#undef	BROKEN_PAND
-
-
-/*	The type of an value that fits in an MMX register
-	(note that long long constant values MUST be suffixed
-	 by LL and unsigned long long values by ULL, lest
-	 they be truncated by the compiler)
-*/
-typedef union
-{
-    long long q;                /* Quadword (64-bit) value */
-    unsigned long long uq;      /* Unsigned Quadword */
-    int d[2];                   /* 2 Doubleword (32-bit) values */
-    unsigned int ud[2];         /* 2 Unsigned Doubleword */
-    short w[4];                 /* 4 Word (16-bit) values */
-    unsigned short uw[4];       /* 4 Unsigned Word */
-    char b[8];                  /* 8 Byte (8-bit) values */
-    unsigned char ub[8];        /* 8 Unsigned Byte */
-    float s[2];                 /* Single-precision (32-bit) value */
-} __attribute__ ((aligned(8))) mmx_t;   /* On an 8-byte (64-bit) boundary */
-
-
-#if 0
-/*	Function to test if multimedia instructions are supported...
-*/
-inline extern int
-mm_support(void)
-{
-    /* Returns 1 if MMX instructions are supported,
-       3 if Cyrix MMX and Extended MMX instructions are supported
-       5 if AMD MMX and 3DNow! instructions are supported
-       0 if hardware does not support any of these
-     */
-    register int rval = 0;
-
-    __asm__ __volatile__(
-                            /* See if CPUID instruction is supported ... */
-                            /* ... Get copies of EFLAGS into eax and ecx */
-                            "pushf\n\t"
-                            "popl %%eax\n\t" "movl %%eax, %%ecx\n\t"
-                            /* ... Toggle the ID bit in one copy and store */
-                            /*     to the EFLAGS reg */
-                            "xorl $0x200000, %%eax\n\t"
-                            "push %%eax\n\t" "popf\n\t"
-                            /* ... Get the (hopefully modified) EFLAGS */
-                            "pushf\n\t" "popl %%eax\n\t"
-                            /* ... Compare and test result */
-                            "xorl %%eax, %%ecx\n\t" "testl $0x200000, %%ecx\n\t" "jz NotSupported1\n\t" /* CPUID not supported */
-                            /* Get standard CPUID information, and
-                               go to a specific vendor section */
-                            "movl $0, %%eax\n\t" "cpuid\n\t"
-                            /* Check for Intel */
-                            "cmpl $0x756e6547, %%ebx\n\t"
-                            "jne TryAMD\n\t"
-                            "cmpl $0x49656e69, %%edx\n\t"
-                            "jne TryAMD\n\t"
-                            "cmpl $0x6c65746e, %%ecx\n"
-                            "jne TryAMD\n\t" "jmp Intel\n\t"
-                            /* Check for AMD */
-                            "\nTryAMD:\n\t"
-                            "cmpl $0x68747541, %%ebx\n\t"
-                            "jne TryCyrix\n\t"
-                            "cmpl $0x69746e65, %%edx\n\t"
-                            "jne TryCyrix\n\t"
-                            "cmpl $0x444d4163, %%ecx\n"
-                            "jne TryCyrix\n\t" "jmp AMD\n\t"
-                            /* Check for Cyrix */
-                            "\nTryCyrix:\n\t"
-                            "cmpl $0x69727943, %%ebx\n\t"
-                            "jne NotSupported2\n\t"
-                            "cmpl $0x736e4978, %%edx\n\t"
-                            "jne NotSupported3\n\t"
-                            "cmpl $0x64616574, %%ecx\n\t"
-                            "jne NotSupported4\n\t"
-                            /* Drop through to Cyrix... */
-                            /* Cyrix Section */
-                            /* See if extended CPUID level 80000001 is supported */
-                            /* The value of CPUID/80000001 for the 6x86MX is undefined
-                               according to the Cyrix CPU Detection Guide (Preliminary
-                               Rev. 1.01 table 1), so we'll check the value of eax for
-                               CPUID/0 to see if standard CPUID level 2 is supported.
-                               According to the table, the only CPU which supports level
-                               2 is also the only one which supports extended CPUID levels.
-                             */
-                            "cmpl $0x2, %%eax\n\t" "jne MMXtest\n\t"    /* Use standard CPUID instead */
-                            /* Extended CPUID supported (in theory), so get extended
-                               features */
-                            "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%eax\n\t"    /* Test for MMX */
-                            "jz NotSupported5\n\t"      /* MMX not supported */
-                            "testl $0x01000000, %%eax\n\t"      /* Test for Ext'd MMX */
-                            "jnz EMMXSupported\n\t" "movl $1, %0:\n\n\t"        /* MMX Supported */
-                            "jmp Return\n\n" "EMMXSupported:\n\t" "movl $3, %0:\n\n\t"  /* EMMX and MMX Supported */
-                            "jmp Return\n\t"
-                            /* AMD Section */
-                            "AMD:\n\t"
-                            /* See if extended CPUID is supported */
-                            "movl $0x80000000, %%eax\n\t" "cpuid\n\t" "cmpl $0x80000000, %%eax\n\t" "jl MMXtest\n\t"    /* Use standard CPUID instead */
-                            /* Extended CPUID supported, so get extended features */
-                            "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"    /* Test for MMX */
-                            "jz NotSupported6\n\t"      /* MMX not supported */
-                            "testl $0x80000000, %%edx\n\t"      /* Test for 3DNow! */
-                            "jnz ThreeDNowSupported\n\t" "movl $1, %0:\n\n\t"   /* MMX Supported */
-                            "jmp Return\n\n" "ThreeDNowSupported:\n\t" "movl $5, %0:\n\n\t"     /* 3DNow! and MMX Supported */
-                            "jmp Return\n\t"
-                            /* Intel Section */
-                            "Intel:\n\t"
-                            /* Check for MMX */
-                            "MMXtest:\n\t" "movl $1, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"      /* Test for MMX */
-                            "jz NotSupported7\n\t"      /* MMX Not supported */
-                            "movl $1, %0:\n\n\t"        /* MMX Supported */
-                            "jmp Return\n\t"
-                            /* Nothing supported */
-                            "\nNotSupported1:\n\t" "#movl $101, %0:\n\n\t" "\nNotSupported2:\n\t" "#movl $102, %0:\n\n\t" "\nNotSupported3:\n\t" "#movl $103, %0:\n\n\t" "\nNotSupported4:\n\t" "#movl $104, %0:\n\n\t" "\nNotSupported5:\n\t" "#movl $105, %0:\n\n\t" "\nNotSupported6:\n\t" "#movl $106, %0:\n\n\t" "\nNotSupported7:\n\t" "#movl $107, %0:\n\n\t" "movl $0, %0:\n\n\t" "Return:\n\t":"=a"(rval):     /* no input */
-                            :"eax", "ebx", "ecx", "edx");
-
-    /* Return */
-    return (rval);
-}
-
-/*	Function to test if mmx instructions are supported...
-*/
-inline extern int
-mmx_ok(void)
-{
-    /* Returns 1 if MMX instructions are supported, 0 otherwise */
-    return (mm_support() & 0x1);
-}
-#endif
-
-/*	Helper functions for the instruction macros that follow...
-	(note that memory-to-register, m2r, instructions are nearly
-	 as efficient as register-to-register, r2r, instructions;
-	 however, memory-to-memory instructions are really simulated
-	 as a convenience, and are only 1/3 as efficient)
-*/
-#ifdef	MMX_TRACE
-
-/*	Include the stuff for printing a trace to stderr...
-*/
-
-#define	mmx_i2r(op, imm, reg) \
-	{ \
-		mmx_t mmx_trace; \
-		mmx_trace.uq = (imm); \
-		printf(#op "_i2r(" #imm "=0x%08x%08x, ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		__asm__ __volatile__ ("movq %%" #reg ", %0" \
-				      : "=X" (mmx_trace) \
-				      : /* nothing */ ); \
-		printf(#reg "=0x%08x%08x) => ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		__asm__ __volatile__ (#op " %0, %%" #reg \
-				      : /* nothing */ \
-				      : "X" (imm)); \
-		__asm__ __volatile__ ("movq %%" #reg ", %0" \
-				      : "=X" (mmx_trace) \
-				      : /* nothing */ ); \
-		printf(#reg "=0x%08x%08x\n", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-	}
-
-#define	mmx_m2r(op, mem, reg) \
-	{ \
-		mmx_t mmx_trace; \
-		mmx_trace = (mem); \
-		printf(#op "_m2r(" #mem "=0x%08x%08x, ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		__asm__ __volatile__ ("movq %%" #reg ", %0" \
-				      : "=X" (mmx_trace) \
-				      : /* nothing */ ); \
-		printf(#reg "=0x%08x%08x) => ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		__asm__ __volatile__ (#op " %0, %%" #reg \
-				      : /* nothing */ \
-				      : "X" (mem)); \
-		__asm__ __volatile__ ("movq %%" #reg ", %0" \
-				      : "=X" (mmx_trace) \
-				      : /* nothing */ ); \
-		printf(#reg "=0x%08x%08x\n", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-	}
-
-#define	mmx_r2m(op, reg, mem) \
-	{ \
-		mmx_t mmx_trace; \
-		__asm__ __volatile__ ("movq %%" #reg ", %0" \
-				      : "=X" (mmx_trace) \
-				      : /* nothing */ ); \
-		printf(#op "_r2m(" #reg "=0x%08x%08x, ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		mmx_trace = (mem); \
-		printf(#mem "=0x%08x%08x) => ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		__asm__ __volatile__ (#op " %%" #reg ", %0" \
-				      : "=X" (mem) \
-				      : /* nothing */ ); \
-		mmx_trace = (mem); \
-		printf(#mem "=0x%08x%08x\n", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-	}
-
-#define	mmx_r2r(op, regs, regd) \
-	{ \
-		mmx_t mmx_trace; \
-		__asm__ __volatile__ ("movq %%" #regs ", %0" \
-				      : "=X" (mmx_trace) \
-				      : /* nothing */ ); \
-		printf(#op "_r2r(" #regs "=0x%08x%08x, ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		__asm__ __volatile__ ("movq %%" #regd ", %0" \
-				      : "=X" (mmx_trace) \
-				      : /* nothing */ ); \
-		printf(#regd "=0x%08x%08x) => ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
-		__asm__ __volatile__ ("movq %%" #regd ", %0" \
-				      : "=X" (mmx_trace) \
-				      : /* nothing */ ); \
-		printf(#regd "=0x%08x%08x\n", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-	}
-
-#define	mmx_m2m(op, mems, memd) \
-	{ \
-		mmx_t mmx_trace; \
-		mmx_trace = (mems); \
-		printf(#op "_m2m(" #mems "=0x%08x%08x, ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		mmx_trace = (memd); \
-		printf(#memd "=0x%08x%08x) => ", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
-				      #op " %1, %%mm0\n\t" \
-				      "movq %%mm0, %0" \
-				      : "=X" (memd) \
-				      : "X" (mems)); \
-		mmx_trace = (memd); \
-		printf(#memd "=0x%08x%08x\n", \
-			mmx_trace.d[1], mmx_trace.d[0]); \
-	}
-
-#else
-
-/*	These macros are a lot simpler without the tracing...
-*/
-
-#define	mmx_i2r(op, imm, reg) \
-	__asm__ __volatile__ (#op " %0, %%" #reg \
-			      : /* nothing */ \
-			      : "X" (imm) )
-
-#define	mmx_m2r(op, mem, reg) \
-	__asm__ __volatile__ (#op " %0, %%" #reg \
-			      : /* nothing */ \
-			      : "m" (mem))
-
-#define	mmx_r2m(op, reg, mem) \
-	__asm__ __volatile__ (#op " %%" #reg ", %0" \
-			      : "=m" (mem) \
-			      : /* nothing */ )
-
-#define	mmx_r2r(op, regs, regd) \
-	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
-
-#define	mmx_m2m(op, mems, memd) \
-	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
-			      #op " %1, %%mm0\n\t" \
-			      "movq %%mm0, %0" \
-			      : "=X" (memd) \
-			      : "X" (mems))
-
-#endif
-
-
-/*	1x64 MOVe Quadword
-	(this is both a load and a store...
-	 in fact, it is the only way to store)
-*/
-#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
-#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
-#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
-#define	movq(vars, vard) \
-	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
-			      "movq %%mm0, %0" \
-			      : "=X" (vard) \
-			      : "X" (vars))
-
-
-/*	1x32 MOVe Doubleword
-	(like movq, this is both load and store...
-	 but is most useful for moving things between
-	 mmx registers and ordinary registers)
-*/
-#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
-#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
-#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
-#define	movd(vars, vard) \
-	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
-			      "movd %%mm0, %0" \
-			      : "=X" (vard) \
-			      : "X" (vars))
-
-
-/*	2x32, 4x16, and 8x8 Parallel ADDs
-*/
-#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
-#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
-#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)
-
-#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
-#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
-#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)
-
-#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
-#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
-#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)
-
-
-/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
-*/
-#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
-#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
-#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)
-
-#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
-#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
-#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)
-
-
-/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
-*/
-#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
-#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
-#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)
-
-#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
-#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
-#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)
-
-
-/*	2x32, 4x16, and 8x8 Parallel SUBs
-*/
-#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
-#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
-#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)
-
-#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
-#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
-#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)
-
-#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
-#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
-#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)
-
-
-/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
-*/
-#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
-#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
-#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)
-
-#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
-#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
-#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)
-
-
-/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
-*/
-#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
-#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
-#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)
-
-#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
-#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
-#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)
-
-
-/*	4x16 Parallel MULs giving Low 4x16 portions of results
-*/
-#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
-#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
-#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)
-
-
-/*	4x16 Parallel MULs giving High 4x16 portions of results
-*/
-#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
-#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
-#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)
-
-
-/*	4x16->2x32 Parallel Mul-ADD
-	(muls like pmullw, then adds adjacent 16-bit fields
-	 in the multiply result to make the final 2x32 result)
-*/
-#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
-#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
-#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
-
-
-/*	1x64 bitwise AND
-*/
-#ifdef	BROKEN_PAND
-#define	pand_m2r(var, reg) \
-	{ \
-		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
-		mmx_m2r(pandn, var, reg); \
-	}
-#define	pand_r2r(regs, regd) \
-	{ \
-		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
-		mmx_r2r(pandn, regs, regd) \
-	}
-#define	pand(vars, vard) \
-	{ \
-		movq_m2r(vard, mm0); \
-		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
-		mmx_m2r(pandn, vars, mm0); \
-		movq_r2m(mm0, vard); \
-	}
-#else
-#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
-#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
-#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
-#endif
-
-
-/*	1x64 bitwise AND with Not the destination
-*/
-#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
-#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
-#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)
-
-
-/*	1x64 bitwise OR
-*/
-#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
-#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
-#define	por(vars, vard)	mmx_m2m(por, vars, vard)
-
-
-/*	1x64 bitwise eXclusive OR
-*/
-#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
-#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
-#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)
-
-
-/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
-	(resulting fields are either 0 or -1)
-*/
-#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
-#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
-#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)
-
-#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
-#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
-#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)
-
-#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
-#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
-#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)
-
-
-/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
-	(resulting fields are either 0 or -1)
-*/
-#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
-#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
-#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)
-
-#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
-#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
-#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)
-
-#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
-#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
-#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)
-
-
-/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
-*/
-#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
-#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
-#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
-#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)
-
-#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
-#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
-#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
-#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)
-
-#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
-#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
-#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
-#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)
-
-
-/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
-*/
-#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
-#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
-#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
-#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)
-
-#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
-#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
-#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
-#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)
-
-#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
-#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
-#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
-#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)
-
-
-/*	2x32 and 4x16 Parallel Shift Right Arithmetic
-*/
-#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
-#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
-#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
-#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)
-
-#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
-#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
-#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
-#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)
-
-
-/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
-	(packs source and dest fields into dest in that order)
-*/
-#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
-#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
-#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)
-
-#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
-#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
-#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)
-
-
-/*	4x16->8x8 PACK and Unsigned Saturate
-	(packs source and dest fields into dest in that order)
-*/
-#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
-#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
-#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)
-
-
-/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
-	(interleaves low half of dest with low half of source
-	 as padding in each result field)
-*/
-#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
-#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
-#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)
-
-#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
-#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
-#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)
-
-#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
-#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
-#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)
-
-
-/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
-	(interleaves high half of dest with high half of source
-	 as padding in each result field)
-*/
-#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
-#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
-#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)
-
-#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
-#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
-#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)
-
-#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
-#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
-#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)
-
-
-/*	Empty MMx State
-	(used to clean-up when going from mmx to float use
-	 of the registers that are shared by both; note that
-	 there is no float-to-mmx operation needed, because
-	 only the float tag word info is corruptible)
-*/
-#ifdef	MMX_TRACE
-
-#define	emms() \
-	{ \
-		printf("emms()\n"); \
-		__asm__ __volatile__ ("emms"); \
-	}
-
-#else
-
-#define	emms()			__asm__ __volatile__ ("emms")
-
-#endif
-
-#endif
-/* vi: set ts=4 sw=4 expandtab: */
--- a/src/render/opengl/SDL_render_gl.c
+++ b/src/render/opengl/SDL_render_gl.c
@ -1349,13 +1349,37 @@ GL_SetupCopy(SDL_Renderer * renderer, SDL_Texture * texture)

    GL_SetBlendMode(data, texture->blendMode);

-    if (texturedata->yuv) {
-        GL_SetShader(data, SHADER_YUV);
-    } else if (texturedata->nv12) {
-        if (texture->format == SDL_PIXELFORMAT_NV12) {
-            GL_SetShader(data, SHADER_NV12);
-        } else {
-            GL_SetShader(data, SHADER_NV21);
+    if (texturedata->yuv || texturedata->nv12) {
+        switch (SDL_GetYUVConversionModeForResolution(texture->w, texture->h)) {
+        case SDL_YUV_CONVERSION_JPEG:
+            if (texturedata->yuv) {
+                GL_SetShader(data, SHADER_YUV_JPEG);
+            } else if (texture->format == SDL_PIXELFORMAT_NV12) {
+                GL_SetShader(data, SHADER_NV12_JPEG);
+            } else {
+                GL_SetShader(data, SHADER_NV21_JPEG);
+            }
+            break;
+        case SDL_YUV_CONVERSION_BT601:
+            if (texturedata->yuv) {
+                GL_SetShader(data, SHADER_YUV_BT601);
+            } else if (texture->format == SDL_PIXELFORMAT_NV12) {
+                GL_SetShader(data, SHADER_NV12_BT601);
+            } else {
+                GL_SetShader(data, SHADER_NV21_BT601);
+            }
+            break;
+        case SDL_YUV_CONVERSION_BT709:
+            if (texturedata->yuv) {
+                GL_SetShader(data, SHADER_YUV_BT709);
+            } else if (texture->format == SDL_PIXELFORMAT_NV12) {
+                GL_SetShader(data, SHADER_NV12_BT709);
+            } else {
+                GL_SetShader(data, SHADER_NV21_BT709);
+            }
+            break;
+        default:
+            return SDL_SetError("Unsupported YUV conversion mode");
        }
    } else {
        GL_SetShader(data, SHADER_RGB);
--- a/src/render/opengl/SDL_shaders_gl.c
+++ b/src/render/opengl/SDL_shaders_gl.c
@ -62,6 +62,151 @@ struct GL_ShaderContext
    GL_ShaderData shaders[NUM_SHADERS];
 };

+/*
+ * GLSL vertex shader source (one concatenated string literal): transforms
+ * gl_Vertex by gl_ModelViewProjectionMatrix and forwards gl_Color to the
+ * fragment stage as the varying v_color. Used as the vertex shader of the
+ * SHADER_SOLID entry in shader_source[].
+ * The last line ends with a line-continuation backslash, so the blank line
+ * that follows the macro is part of it and must be kept.
+ */
+#define COLOR_VERTEX_SHADER                                     \
+"varying vec4 v_color;\n"                                       \
+"\n"                                                            \
+"void main()\n"                                                 \
+"{\n"                                                           \
+"    gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;\n" \
+"    v_color = gl_Color;\n"                                     \
+"}"                                                             \
+
+/*
+ * GLSL vertex shader source for textured drawing: same position and color
+ * handling as COLOR_VERTEX_SHADER, and additionally forwards the first
+ * texture coordinate set (gl_MultiTexCoord0) as the varying v_texCoord.
+ * Used as the vertex shader of the SHADER_RGB and SHADER_YUV_JPEG entries
+ * in shader_source[].
+ * The last line ends with a line-continuation backslash, so the blank line
+ * that follows the macro is part of it and must be kept.
+ */
+#define TEXTURE_VERTEX_SHADER                                   \
+"varying vec4 v_color;\n"                                       \
+"varying vec2 v_texCoord;\n"                                    \
+"\n"                                                            \
+"void main()\n"                                                 \
+"{\n"                                                           \
+"    gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;\n" \
+"    v_color = gl_Color;\n"                                     \
+"    v_texCoord = vec2(gl_MultiTexCoord0);\n"                   \
+"}"                                                             \
+
+/*
+ * GLSL constant declarations for the JPEG conversion mode. They define the
+ * names `offset`, `Rcoeff`, `Gcoeff` and `Bcoeff` that the *_SHADER_BODY
+ * macros read, so they must appear in the fragment source before the body.
+ * Here Y gets no offset and a coefficient of 1; U and V are shifted by
+ * -0.501960814 (approximately 128/255).
+ * NOTE(review): presumably this corresponds to full-range JPEG YCbCr -
+ * confirm against the SDL_YUV_CONVERSION_JPEG documentation.
+ */
+#define JPEG_SHADER_CONSTANTS                                   \
+"// YUV offset \n"                                              \
+"const vec3 offset = vec3(0, -0.501960814, -0.501960814);\n"    \
+"\n"                                                            \
+"// RGB coefficients \n"                                        \
+"const vec3 Rcoeff = vec3(1,  0.000,  1.402);\n"                \
+"const vec3 Gcoeff = vec3(1, -0.3441, -0.7141);\n"              \
+"const vec3 Bcoeff = vec3(1,  1.772,  0.000);\n"                \
+
+/*
+ * GLSL constant declarations for the BT601 conversion mode; same names as
+ * JPEG_SHADER_CONSTANTS (`offset`, `Rcoeff`, `Gcoeff`, `Bcoeff`).
+ * Y is shifted by -0.0627451017 (approximately 16/255) and scaled by
+ * 1.1644; U and V are shifted by -0.501960814 (approximately 128/255).
+ * NOTE(review): presumably limited-range BT.601 - confirm against the
+ * SDL_YUV_CONVERSION_BT601 documentation.
+ */
+#define BT601_SHADER_CONSTANTS                                  \
+"// YUV offset \n"                                              \
+"const vec3 offset = vec3(-0.0627451017, -0.501960814, -0.501960814);\n" \
+"\n"                                                            \
+"// RGB coefficients \n"                                        \
+"const vec3 Rcoeff = vec3(1.1644,  0.000,  1.596);\n"           \
+"const vec3 Gcoeff = vec3(1.1644, -0.3918, -0.813);\n"          \
+"const vec3 Bcoeff = vec3(1.1644,  2.0172,  0.000);\n"          \
+
+/*
+ * GLSL constant declarations for the BT709 conversion mode; same names and
+ * the same offsets as BT601_SHADER_CONSTANTS, with different chroma
+ * coefficients.
+ * NOTE(review): presumably limited-range BT.709 - confirm against the
+ * SDL_YUV_CONVERSION_BT709 documentation.
+ */
+#define BT709_SHADER_CONSTANTS                                  \
+"// YUV offset \n"                                              \
+"const vec3 offset = vec3(-0.0627451017, -0.501960814, -0.501960814);\n" \
+"\n"                                                            \
+"// RGB coefficients \n"                                        \
+"const vec3 Rcoeff = vec3(1.1644,  0.000,  1.7927);\n"          \
+"const vec3 Gcoeff = vec3(1.1644, -0.2132, -0.5329);\n"         \
+"const vec3 Bcoeff = vec3(1.1644,  2.1124,  0.000);\n"          \
+
+/*
+ * Opening declarations of the planar-YUV fragment shader: the two varyings
+ * written by TEXTURE_VERTEX_SHADER and three samplers, one per plane
+ * (tex0 = Y, tex1 = U, tex2 = V).
+ */
+#define YUV_SHADER_PROLOGUE                                     \
+"varying vec4 v_color;\n"                                       \
+"varying vec2 v_texCoord;\n"                                    \
+"uniform sampler2D tex0; // Y \n"                               \
+"uniform sampler2D tex1; // U \n"                               \
+"uniform sampler2D tex2; // V \n"                               \
+"\n"                                                            \
+
+/*
+ * main() of the planar-YUV fragment shader. Reads Y from tex0 at
+ * v_texCoord, reads U and V from tex1/tex2 at v_texCoord * UVCoordScale
+ * (red channel in each case), adds `offset`, and forms each RGB channel as
+ * a dot product with Rcoeff/Gcoeff/Bcoeff. The result gets alpha 1.0 and is
+ * multiplied by v_color.
+ * `offset` and the *coeff constants come from one of the
+ * *_SHADER_CONSTANTS macros; UVCoordScale is not defined by any macro in
+ * this group and must be supplied elsewhere.
+ */
+#define YUV_SHADER_BODY                                         \
+"\n"                                                            \
+"void main()\n"                                                 \
+"{\n"                                                           \
+"    vec2 tcoord;\n"                                            \
+"    vec3 yuv, rgb;\n"                                          \
+"\n"                                                            \
+"    // Get the Y value \n"                                     \
+"    tcoord = v_texCoord;\n"                                    \
+"    yuv.x = texture2D(tex0, tcoord).r;\n"                      \
+"\n"                                                            \
+"    // Get the U and V values \n"                              \
+"    tcoord *= UVCoordScale;\n"                                 \
+"    yuv.y = texture2D(tex1, tcoord).r;\n"                      \
+"    yuv.z = texture2D(tex2, tcoord).r;\n"                      \
+"\n"                                                            \
+"    // Do the color transform \n"                              \
+"    yuv += offset;\n"                                          \
+"    rgb.r = dot(yuv, Rcoeff);\n"                               \
+"    rgb.g = dot(yuv, Gcoeff);\n"                               \
+"    rgb.b = dot(yuv, Bcoeff);\n"                               \
+"\n"                                                            \
+"    // That was easy. :) \n"                                   \
+"    gl_FragColor = vec4(rgb, 1.0) * v_color;\n"                \
+"}"                                                             \
+
+/*
+ * Opening declarations of the NV12 fragment shader: the two varyings
+ * written by TEXTURE_VERTEX_SHADER and two samplers (tex0 = Y,
+ * tex1 = combined U/V).
+ */
+#define NV12_SHADER_PROLOGUE                                    \
+"varying vec4 v_color;\n"                                       \
+"varying vec2 v_texCoord;\n"                                    \
+"uniform sampler2D tex0; // Y \n"                               \
+"uniform sampler2D tex1; // U/V \n"                             \
+"\n"                                                            \
+
+/*
+ * main() of the NV12 fragment shader. Same steps as YUV_SHADER_BODY except
+ * that both chroma values come from a single fetch of tex1: the `.ra`
+ * swizzle puts the red channel in yuv.y (U) and the alpha channel in
+ * yuv.z (V).
+ * `offset` and the *coeff constants come from one of the
+ * *_SHADER_CONSTANTS macros; UVCoordScale is not defined by any macro in
+ * this group and must be supplied elsewhere.
+ */
+#define NV12_SHADER_BODY                                        \
+"\n"                                                            \
+"void main()\n"                                                 \
+"{\n"                                                           \
+"    vec2 tcoord;\n"                                            \
+"    vec3 yuv, rgb;\n"                                          \
+"\n"                                                            \
+"    // Get the Y value \n"                                     \
+"    tcoord = v_texCoord;\n"                                    \
+"    yuv.x = texture2D(tex0, tcoord).r;\n"                      \
+"\n"                                                            \
+"    // Get the U and V values \n"                              \
+"    tcoord *= UVCoordScale;\n"                                 \
+"    yuv.yz = texture2D(tex1, tcoord).ra;\n"                    \
+"\n"                                                            \
+"    // Do the color transform \n"                              \
+"    yuv += offset;\n"                                          \
+"    rgb.r = dot(yuv, Rcoeff);\n"                               \
+"    rgb.g = dot(yuv, Gcoeff);\n"                               \
+"    rgb.b = dot(yuv, Bcoeff);\n"                               \
+"\n"                                                            \
+"    // That was easy. :) \n"                                   \
+"    gl_FragColor = vec4(rgb, 1.0) * v_color;\n"                \
+"}"                                                             \
+
+/*
+ * Opening declarations of the NV21 fragment shader. The text is identical
+ * to NV12_SHADER_PROLOGUE (tex0 = Y, tex1 = combined chroma); only the
+ * body differs.
+ */
+#define NV21_SHADER_PROLOGUE                                    \
+"varying vec4 v_color;\n"                                       \
+"varying vec2 v_texCoord;\n"                                    \
+"uniform sampler2D tex0; // Y \n"                               \
+"uniform sampler2D tex1; // U/V \n"                             \
+"\n"                                                            \
+
+/*
+ * main() of the NV21 fragment shader. Identical to NV12_SHADER_BODY except
+ * for the chroma swizzle: `.ar` puts the alpha channel in yuv.y (U) and the
+ * red channel in yuv.z (V), i.e. the two chroma channels are swapped
+ * relative to NV12.
+ * `offset` and the *coeff constants come from one of the
+ * *_SHADER_CONSTANTS macros; UVCoordScale is not defined by any macro in
+ * this group and must be supplied elsewhere.
+ */
+#define NV21_SHADER_BODY                                        \
+"\n"                                                            \
+"void main()\n"                                                 \
+"{\n"                                                           \
+"    vec2 tcoord;\n"                                            \
+"    vec3 yuv, rgb;\n"                                          \
+"\n"                                                            \
+"    // Get the Y value \n"                                     \
+"    tcoord = v_texCoord;\n"                                    \
+"    yuv.x = texture2D(tex0, tcoord).r;\n"                      \
+"\n"                                                            \
+"    // Get the U and V values \n"                              \
+"    tcoord *= UVCoordScale;\n"                                 \
+"    yuv.yz = texture2D(tex1, tcoord).ar;\n"                    \
+"\n"                                                            \
+"    // Do the color transform \n"                              \
+"    yuv += offset;\n"                                          \
+"    rgb.r = dot(yuv, Rcoeff);\n"                               \
+"    rgb.g = dot(yuv, Gcoeff);\n"                               \
+"    rgb.b = dot(yuv, Bcoeff);\n"                               \
+"\n"                                                            \
+"    // That was easy. :) \n"                                   \
+"    gl_FragColor = vec4(rgb, 1.0) * v_color;\n"                \
+"}"                                                             \
+
 /*
 * NOTE: Always use sampler2D, etc here. We'll #define them to the
 *  texture_rectangle versions if we choose to use that extension.
@ -74,13 +219,7 @@ static const char *shader_source[NUM_SHADERS][2] =
    /* SHADER_SOLID */
    {
        /* vertex shader */
-"varying vec4 v_color;\n"
-"\n"
-"void main()\n"
-"{\n"
-"    gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;\n"
-"    v_color = gl_Color;\n"
-"}",
+        COLOR_VERTEX_SHADER,
        /* fragment shader */
 "varying vec4 v_color;\n"
 "\n"
@ -93,15 +232,7 @@ static const char *shader_source[NUM_SHADERS][2] =
    /* SHADER_RGB */
    {
        /* vertex shader */
-"varying vec4 v_color;\n"
-"varying vec2 v_texCoord;\n"
-"\n"
-"void main()\n"
-"{\n"
-"    gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;\n"
-"    v_color = gl_Color;\n"
-"    v_texCoord = vec2(gl_MultiTexCoord0);\n"
-"}",
+        TEXTURE_VERTEX_SHADER,
        /* fragment shader */
 "varying vec4 v_color;\n"
 "varying vec2 v_texCoord;\n"
@ -113,156 +244,86 @@ static const char *shader_source[NUM_SHADERS][2] =
 "}"
    },

-    /* SHADER_YUV */
+    /* SHADER_YUV_JPEG */
    {
        /* vertex shader */
-"varying vec4 v_color;\n"
-"varying vec2 v_texCoord;\n"
-"\n"
-"void main()\n"
-"{\n"
-"    gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;\n"
-"    v_color = gl_Color;\n"
-"    v_texCoord = vec2(gl_MultiTexCoord0);\n"
-"}",
+        TEXTURE_VERTEX_SHADER,
        /* fragment shader */
-"varying vec4 v_color;\n"
-"varying vec2 v_texCoord;\n"
-"uniform sampler2D tex0; // Y \n"
-"uniform sampler2D tex1; // U \n"
-"uniform sampler2D tex2; // V \n"
-"\n"
-"// YUV offset \n"
-"const vec3 offset = vec3(-0.0627451017, -0.501960814, -0.501960814);\n"
-"\n"
-"// RGB coefficients \n"
-"const vec3 Rcoeff = vec3(1.164,  0.000,  1.596);\n"
-"const vec3 Gcoeff = vec3(1.164, -0.391, -0.813);\n"
-"const vec3 Bcoeff = vec3(1.164,  2.018,  0.000);\n"
-"\n"
-"void main()\n"
-"{\n"
-"    vec2 tcoord;\n"
-"    vec3 yuv, rgb;\n"
-"\n"
-"    // Get the Y value \n"
-"    tcoord = v_texCoord;\n"
-"    yuv.x = texture2D(tex0, tcoord).r;\n"
-"\n"
-"    // Get the U and V values \n"
-"    tcoord *= UVCoordScale;\n"
-"    yuv.y = texture2D(tex1, tcoord).r;\n"
-"    yuv.z = texture2D(tex2, tcoord).r;\n"
-"\n"
-"    // Do the color transform \n"
-"    yuv += offset;\n"
-"    rgb.r = dot(yuv, Rcoeff);\n"
-"    rgb.g = dot(yuv, Gcoeff);\n"
-"    rgb.b = dot(yuv, Bcoeff);\n"
-"\n"
-"    // That was easy. :) \n"
-"    gl_FragColor = vec4(rgb, 1.0) * v_color;\n"
-"}"
+        YUV_SHADER_PROLOGUE
+        JPEG_SHADER_CONSTANTS
+        YUV_SHADER_BODY
    },
-
-    /* SHADER_NV12 */
+    /* SHADER_YUV_BT601 */
    {
        /* vertex shader */
-"varying vec4 v_color;\n"
-"varying vec2 v_texCoord;\n"
-"\n"
-"void main()\n"
-"{\n"
-"    gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;\n"
-"    v_color = gl_Color;\n"
-"    v_texCoord = vec2(gl_MultiTexCoord0);\n"
-"}",
+        TEXTURE_VERTEX_SHADER,
        /* fragment shader */
-"varying vec4 v_color;\n"
-"varying vec2 v_texCoord;\n"
-"uniform sampler2D tex0; // Y \n"
-"uniform sampler2D tex1; // U/V \n"
-"\n"
-"// YUV offset \n"
-"const vec3 offset = vec3(-0.0627451017, -0.501960814, -0.501960814);\n"
-"\n"
-"// RGB coefficients \n"
-"const vec3 Rcoeff = vec3(1.164,  0.000,  1.596);\n"
-"const vec3 Gcoeff = vec3(1.164, -0.391, -0.813);\n"
-"const vec3 Bcoeff = vec3(1.164,  2.018,  0.000);\n"
-"\n"
-"void main()\n"
-"{\n"
-"    vec2 tcoord;\n"
-"    vec3 yuv, rgb;\n"
-"\n"
-"    // Get the Y value \n"
-"    tcoord = v_texCoord;\n"
-"    yuv.x = texture2D(tex0, tcoord).r;\n"
-"\n"
-"    // Get the U and V values \n"
-"    tcoord *= UVCoordScale;\n"
-"    yuv.yz = texture2D(tex1, tcoord).ra;\n"
-"\n"
-"    // Do the color transform \n"
-"    yuv += offset;\n"
-"    rgb.r = dot(yuv, Rcoeff);\n"
-"    rgb.g = dot(yuv, Gcoeff);\n"
-"    rgb.b = dot(yuv, Bcoeff);\n"
-"\n"
-"    // That was easy. :) \n"
-"    gl_FragColor = vec4(rgb, 1.0) * v_color;\n"
-"}"
+        YUV_SHADER_PROLOGUE
+        BT601_SHADER_CONSTANTS
+        YUV_SHADER_BODY
    },
-
-    /* SHADER_NV21 */
+    /* SHADER_YUV_BT709 */
    {
        /* vertex shader */
-"varying vec4 v_color;\n"
-"varying vec2 v_texCoord;\n"
-"\n"
-"void main()\n"
-"{\n"
-"    gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;\n"
-"    v_color = gl_Color;\n"
-"    v_texCoord = vec2(gl_MultiTexCoord0);\n"
-"}",
+        TEXTURE_VERTEX_SHADER,
        /* fragment shader */
-"varying vec4 v_color;\n"
-"varying vec2 v_texCoord;\n"
-"uniform sampler2D tex0; // Y \n"
-"uniform sampler2D tex1; // U/V \n"
-"\n"
-"// YUV offset \n"
-"const vec3 offset = vec3(-0.0627451017, -0.501960814, -0.501960814);\n"
-"\n"
-"// RGB coefficients \n"
-"const vec3 Rcoeff = vec3(1.164,  0.000,  1.596);\n"
-"const vec3 Gcoeff = vec3(1.164, -0.391, -0.813);\n"
-"const vec3 Bcoeff = vec3(1.164,  2.018,  0.000);\n"
-"\n"
-"void main()\n"
-"{\n"
-"    vec2 tcoord;\n"
-"    vec3 yuv, rgb;\n"
-"\n"
-"    // Get the Y value \n"
-"    tcoord = v_texCoord;\n"
-"    yuv.x = texture2D(tex0, tcoord).r;\n"
-"\n"
-"    // Get the U and V values \n"
-"    tcoord *= UVCoordScale;\n"
-"    yuv.yz = texture2D(tex1, tcoord).ar;\n"
-"\n"
-"    // Do the color transform \n"
-"    yuv += offset;\n"
-"    rgb.r = dot(yuv, Rcoeff);\n"
-"    rgb.g = dot(yuv, Gcoeff);\n"
-"    rgb.b = dot(yuv, Bcoeff);\n"
-"\n"
-"    // That was easy. :) \n"
-"    gl_FragColor = vec4(rgb, 1.0) * v_color;\n"
-"}"
+        YUV_SHADER_PROLOGUE
+        BT709_SHADER_CONSTANTS
+        YUV_SHADER_BODY
+    },
+    /* SHADER_NV12_JPEG */
+    {
+        /* vertex shader */
+        TEXTURE_VERTEX_SHADER,
+        /* fragment shader */
+        NV12_SHADER_PROLOGUE
+        JPEG_SHADER_CONSTANTS
+        NV12_SHADER_BODY
+    },
+    /* SHADER_NV12_BT601 */
+    {
+        /* vertex shader */
+        TEXTURE_VERTEX_SHADER,
+        /* fragment shader */
+        NV12_SHADER_PROLOGUE
+        BT601_SHADER_CONSTANTS
+        NV12_SHADER_BODY
+    },
+    /* SHADER_NV12_BT709 */
+    {
+        /* vertex shader */
+        TEXTURE_VERTEX_SHADER,
+        /* fragment shader */
+        NV12_SHADER_PROLOGUE
+        BT709_SHADER_CONSTANTS
+        NV12_SHADER_BODY
+    },
+    /* SHADER_NV21_JPEG */
+    {
+        /* vertex shader */
+        TEXTURE_VERTEX_SHADER,
+        /* fragment shader */
+        NV21_SHADER_PROLOGUE
+        JPEG_SHADER_CONSTANTS
+        NV21_SHADER_BODY
+    },
+    /* SHADER_NV21_BT601 */
+    {
+        /* vertex shader */
+        TEXTURE_VERTEX_SHADER,
+        /* fragment shader */
+        NV21_SHADER_PROLOGUE
+        BT601_SHADER_CONSTANTS
+        NV21_SHADER_BODY
+    },
+    /* SHADER_NV21_BT709 */
+    {
+        /* vertex shader */
+        TEXTURE_VERTEX_SHADER,
+        /* fragment shader */
+        NV21_SHADER_PROLOGUE
+        BT709_SHADER_CONSTANTS
+        NV21_SHADER_BODY
    },
 };

--- a/src/render/opengl/SDL_shaders_gl.h
+++ b/src/render/opengl/SDL_shaders_gl.h
@ -26,9 +26,15 @@ typedef enum {
    SHADER_NONE,
    SHADER_SOLID,
    SHADER_RGB,
-    SHADER_YUV,
-    SHADER_NV12,
-    SHADER_NV21,
+    SHADER_YUV_JPEG,
+    SHADER_YUV_BT601,
+    SHADER_YUV_BT709,
+    SHADER_NV12_JPEG,
+    SHADER_NV12_BT601,
+    SHADER_NV12_BT709,
+    SHADER_NV21_JPEG,
+    SHADER_NV21_BT601,
+    SHADER_NV21_BT709,
    NUM_SHADERS
 } GL_Shader;

--- a/src/render/opengles2/SDL_render_gles2.c
+++ b/src/render/opengles2/SDL_render_gles2.c
@ -950,7 +950,7 @@ static void GLES2_EvictShader(SDL_Renderer *renderer, GLES2_ShaderCacheEntry *en
 static GLES2_ProgramCacheEntry *GLES2_CacheProgram(SDL_Renderer *renderer,
                                                   GLES2_ShaderCacheEntry *vertex,
                                                   GLES2_ShaderCacheEntry *fragment);
-static int GLES2_SelectProgram(SDL_Renderer *renderer, GLES2_ImageSource source);
+static int GLES2_SelectProgram(SDL_Renderer *renderer, GLES2_ImageSource source, int w, int h);

 static GLES2_ProgramCacheEntry *
 GLES2_CacheProgram(SDL_Renderer *renderer, GLES2_ShaderCacheEntry *vertex,
@ -1189,7 +1189,7 @@ GLES2_EvictShader(SDL_Renderer *renderer, GLES2_ShaderCacheEntry *entry)
 }

 static int
-GLES2_SelectProgram(SDL_Renderer *renderer, GLES2_ImageSource source)
+GLES2_SelectProgram(SDL_Renderer *renderer, GLES2_ImageSource source, int w, int h)
 {
    GLES2_DriverContext *data = (GLES2_DriverContext *)renderer->driverdata;
    GLES2_ShaderCacheEntry *vertex = NULL;
@ -1216,13 +1216,52 @@ GLES2_SelectProgram(SDL_Renderer *renderer, GLES2_ImageSource source)
        ftype = GLES2_SHADER_FRAGMENT_TEXTURE_BGR_SRC;
        break;
    case GLES2_IMAGESOURCE_TEXTURE_YUV:
-        ftype = GLES2_SHADER_FRAGMENT_TEXTURE_YUV_SRC;
+        switch (SDL_GetYUVConversionModeForResolution(w, h)) {
+        case SDL_YUV_CONVERSION_JPEG:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_YUV_JPEG_SRC;
+            break;
+        case SDL_YUV_CONVERSION_BT601:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_YUV_BT601_SRC;
+            break;
+        case SDL_YUV_CONVERSION_BT709:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_YUV_BT709_SRC;
+            break;
+        default:
+            SDL_SetError("Unsupported YUV conversion mode: %d\n", SDL_GetYUVConversionModeForResolution(w, h));
+            goto fault;
+        }
        break;
    case GLES2_IMAGESOURCE_TEXTURE_NV12:
-        ftype = GLES2_SHADER_FRAGMENT_TEXTURE_NV12_SRC;
+        switch (SDL_GetYUVConversionModeForResolution(w, h)) {
+        case SDL_YUV_CONVERSION_JPEG:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_NV12_JPEG_SRC;
+            break;
+        case SDL_YUV_CONVERSION_BT601:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_NV12_BT601_SRC;
+            break;
+        case SDL_YUV_CONVERSION_BT709:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_NV12_BT709_SRC;
+            break;
+        default:
+            SDL_SetError("Unsupported YUV conversion mode: %d\n", SDL_GetYUVConversionModeForResolution(w, h));
+            goto fault;
+        }
        break;
    case GLES2_IMAGESOURCE_TEXTURE_NV21:
-        ftype = GLES2_SHADER_FRAGMENT_TEXTURE_NV21_SRC;
+        switch (SDL_GetYUVConversionModeForResolution(w, h)) {
+        case SDL_YUV_CONVERSION_JPEG:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_NV21_JPEG_SRC;
+            break;
+        case SDL_YUV_CONVERSION_BT601:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_NV21_BT601_SRC;
+            break;
+        case SDL_YUV_CONVERSION_BT709:
+            ftype = GLES2_SHADER_FRAGMENT_TEXTURE_NV21_BT709_SRC;
+            break;
+        default:
+            SDL_SetError("Unsupported YUV conversion mode: %d\n", SDL_GetYUVConversionModeForResolution(w, h));
+            goto fault;
+        }
        break;
    default:
        goto fault;
@ -1445,7 +1484,7 @@ GLES2_SetDrawingState(SDL_Renderer * renderer)
    GLES2_SetTexCoords(data, SDL_FALSE);

    /* Activate an appropriate shader and set the projection matrix */
-    if (GLES2_SelectProgram(renderer, GLES2_IMAGESOURCE_SOLID) < 0) {
+    if (GLES2_SelectProgram(renderer, GLES2_IMAGESOURCE_SOLID, 0, 0) < 0) {
        return -1;
    }

@ -1707,7 +1746,7 @@ GLES2_SetupCopy(SDL_Renderer *renderer, SDL_Texture *texture)
        }
    }

-    if (GLES2_SelectProgram(renderer, sourceType) < 0) {
+    if (GLES2_SelectProgram(renderer, sourceType, texture->w, texture->h) < 0) {
        return -1;
    }

--- a/src/render/opengles2/SDL_shaders_gles2.c
+++ b/src/render/opengles2/SDL_shaders_gles2.c
@ -126,73 +126,154 @@ static const Uint8 GLES2_FragmentSrc_TextureBGRSrc_[] = " \
    } \
 ";

+#define JPEG_SHADER_CONSTANTS /* GLSL constants for the JPEG conversion: luma unshifted, chroma shifted by about -128/255 */ \
+"// YUV offset \n"                                              \
+"const vec3 offset = vec3(0, -0.501960814, -0.501960814);\n"    \
+"\n"                                                            \
+"// RGB coefficients \n"                                        \
+"const mat3 matrix = mat3( 1,       1,        1,\n" /* GLSL fills mat3 column by column: this text row is the Y column (r, g, b weights) */ \
+"                          0,      -0.3441,   1.772,\n" /* U column */ \
+"                          1.402,  -0.7141,   0);\n" /* V column */ \
+
+#define BT601_SHADER_CONSTANTS /* GLSL constants for the BT.601 conversion: luma shifted by about -16/255, chroma by about -128/255 */ \
+"// YUV offset \n"                                              \
+"const vec3 offset = vec3(-0.0627451017, -0.501960814, -0.501960814);\n" \
+"\n"                                                            \
+"// RGB coefficients \n"                                        \
+"const mat3 matrix = mat3( 1.1644,  1.1644,   1.1644,\n" /* GLSL fills mat3 column by column: this text row is the Y column (r, g, b weights) */ \
+"                          0,      -0.3918,   2.0172,\n" /* U column */ \
+"                          1.596,  -0.813,    0);\n" /* V column */ \
+
+#define BT709_SHADER_CONSTANTS /* GLSL constants for the BT.709 conversion: same offsets as BT601_SHADER_CONSTANTS, different chroma weights */ \
+"// YUV offset \n"                                              \
+"const vec3 offset = vec3(-0.0627451017, -0.501960814, -0.501960814);\n" \
+"\n"                                                            \
+"// RGB coefficients \n"                                        \
+"const mat3 matrix = mat3( 1.1644,  1.1644,   1.1644,\n" /* GLSL fills mat3 column by column: this text row is the Y column (r, g, b weights) */ \
+"                          0,      -0.2132,   2.1124,\n" /* U column */ \
+"                          1.7927, -0.5329,   0);\n" /* V column */ \
+
+
+#define YUV_SHADER_PROLOGUE /* declarations pasted at the top of every YUV, NV12 and NV21 fragment source in this file */ \
+"precision mediump float;\n"                                    \
+"uniform sampler2D u_texture;\n" /* sampled for the Y value by all three shader bodies */ \
+"uniform sampler2D u_texture_u;\n" /* U plane in YUV_SHADER_BODY, both chroma values in the NV12 and NV21 bodies */ \
+"uniform sampler2D u_texture_v;\n" /* only YUV_SHADER_BODY samples this one */ \
+"uniform vec4 u_modulation;\n"                                  \
+"varying vec2 v_texCoord;\n"                                    \
+"\n"                                                            \
+
+#define YUV_SHADER_BODY /* GLSL main() for three separate samplers, one each for Y, U and V */ \
+"\n"                                                            \
+"void main()\n"                                                 \
+"{\n"                                                           \
+"    mediump vec3 yuv;\n"                                       \
+"    lowp vec3 rgb;\n"                                          \
+"\n"                                                            \
+"    // Get the YUV values \n"                                  \
+"    yuv.x = texture2D(u_texture,   v_texCoord).r;\n"           \
+"    yuv.y = texture2D(u_texture_u, v_texCoord).r;\n"           \
+"    yuv.z = texture2D(u_texture_v, v_texCoord).r;\n"           \
+"\n"                                                            \
+"    // Do the color transform \n"                              \
+"    yuv += offset;\n" /* offset and matrix come from the *_SHADER_CONSTANTS text pasted before this body */ \
+"    rgb = matrix * yuv;\n"                                     \
+"\n"                                                            \
+"    // That was easy. :) \n"                                   \
+"    gl_FragColor = vec4(rgb, 1);\n" /* alpha is set to 1 before the modulation multiply below */ \
+"    gl_FragColor *= u_modulation;\n"                           \
+"}" /* the trailing backslash is harmless only because the following line is blank */ \
+
+#define NV12_SHADER_BODY /* GLSL main() for NV12 input: Y from u_texture, both chroma values from u_texture_u */ \
+"\n"                                                            \
+"void main()\n"                                                 \
+"{\n"                                                           \
+"    mediump vec3 yuv;\n"                                       \
+"    lowp vec3 rgb;\n"                                          \
+"\n"                                                            \
+"    // Get the YUV values \n"                                  \
+"    yuv.x = texture2D(u_texture,   v_texCoord).r;\n"           \
+"    yuv.yz = texture2D(u_texture_u, v_texCoord).ra;\n" /* U is read from the red channel, V from the alpha channel */ \
+"\n"                                                            \
+"    // Do the color transform \n"                              \
+"    yuv += offset;\n" /* offset and matrix come from the *_SHADER_CONSTANTS text pasted before this body */ \
+"    rgb = matrix * yuv;\n"                                     \
+"\n"                                                            \
+"    // That was easy. :) \n"                                   \
+"    gl_FragColor = vec4(rgb, 1);\n" /* alpha is set to 1 before the modulation multiply below */ \
+"    gl_FragColor *= u_modulation;\n"                           \
+"}" /* the trailing backslash is harmless only because the following line is blank */ \
+
+#define NV21_SHADER_BODY /* GLSL main() for NV21 input: same as NV12_SHADER_BODY except for the chroma swizzle */ \
+"\n"                                                            \
+"void main()\n"                                                 \
+"{\n"                                                           \
+"    mediump vec3 yuv;\n"                                       \
+"    lowp vec3 rgb;\n"                                          \
+"\n"                                                            \
+"    // Get the YUV values \n"                                  \
+"    yuv.x = texture2D(u_texture,   v_texCoord).r;\n"           \
+"    yuv.yz = texture2D(u_texture_u, v_texCoord).ar;\n" /* U is read from the alpha channel, V from the red channel */ \
+"\n"                                                            \
+"    // Do the color transform \n"                              \
+"    yuv += offset;\n" /* offset and matrix come from the *_SHADER_CONSTANTS text pasted before this body */ \
+"    rgb = matrix * yuv;\n"                                     \
+"\n"                                                            \
+"    // That was easy. :) \n"                                   \
+"    gl_FragColor = vec4(rgb, 1);\n" /* alpha is set to 1 before the modulation multiply below */ \
+"    gl_FragColor *= u_modulation;\n"                           \
+"}" /* the trailing backslash is harmless only because the following line is blank */ \
+
 /* YUV to ABGR conversion */
-static const Uint8 GLES2_FragmentSrc_TextureYUVSrc_[] = " \
-    precision mediump float; \
-    uniform sampler2D u_texture; \
-    uniform sampler2D u_texture_u; \
-    uniform sampler2D u_texture_v; \
-    uniform vec4 u_modulation; \
-    varying vec2 v_texCoord; \
-    \
-    void main() \
-    { \
-        mediump vec3 yuv; \
-        lowp vec3 rgb; \
-        yuv.x = texture2D(u_texture,   v_texCoord).r; \
-        yuv.y = texture2D(u_texture_u, v_texCoord).r - 0.5; \
-        yuv.z = texture2D(u_texture_v, v_texCoord).r - 0.5; \
-        rgb = mat3( 1,        1,       1, \
-                    0,       -0.39465, 2.03211, \
-                    1.13983, -0.58060, 0) * yuv; \
-        gl_FragColor = vec4(rgb, 1); \
-        gl_FragColor *= u_modulation; \
-    } \
-";
+static const Uint8 GLES2_FragmentSrc_TextureYUVJPEGSrc_[] = /* three-sampler YUV fragment source built with the JPEG constants */ \
+        YUV_SHADER_PROLOGUE \
+        JPEG_SHADER_CONSTANTS \
+        YUV_SHADER_BODY \
+;
+static const Uint8 GLES2_FragmentSrc_TextureYUVBT601Src_[] = /* three-sampler YUV fragment source built with the BT.601 constants */ \
+        YUV_SHADER_PROLOGUE \
+        BT601_SHADER_CONSTANTS \
+        YUV_SHADER_BODY \
+;
+static const Uint8 GLES2_FragmentSrc_TextureYUVBT709Src_[] = /* three-sampler YUV fragment source built with the BT.709 constants */ \
+        YUV_SHADER_PROLOGUE \
+        BT709_SHADER_CONSTANTS \
+        YUV_SHADER_BODY \
+;

 /* NV12 to ABGR conversion */
-static const Uint8 GLES2_FragmentSrc_TextureNV12Src_[] = " \
-    precision mediump float; \
-    uniform sampler2D u_texture; \
-    uniform sampler2D u_texture_u; \
-    uniform vec4 u_modulation; \
-    varying vec2 v_texCoord; \
-    \
-    void main() \
-    { \
-        mediump vec3 yuv; \
-        lowp vec3 rgb; \
-        yuv.x = texture2D(u_texture,   v_texCoord).r; \
-        yuv.yz = texture2D(u_texture_u, v_texCoord).ra - 0.5; \
-        rgb = mat3( 1,        1,       1, \
-                    0,       -0.39465, 2.03211, \
-                    1.13983, -0.58060, 0) * yuv; \
-        gl_FragColor = vec4(rgb, 1); \
-        gl_FragColor *= u_modulation; \
-    } \
-";
+static const Uint8 GLES2_FragmentSrc_TextureNV12JPEGSrc_[] = /* NV12 fragment source built with the JPEG constants */ \
+        YUV_SHADER_PROLOGUE /* shared prologue: u_texture_v is declared but NV12_SHADER_BODY never samples it */ \
+        JPEG_SHADER_CONSTANTS \
+        NV12_SHADER_BODY \
+;
+static const Uint8 GLES2_FragmentSrc_TextureNV12BT601Src_[] = /* NV12 fragment source built with the BT.601 constants */ \
+        YUV_SHADER_PROLOGUE /* shared prologue: u_texture_v is declared but NV12_SHADER_BODY never samples it */ \
+        BT601_SHADER_CONSTANTS \
+        NV12_SHADER_BODY \
+;
+static const Uint8 GLES2_FragmentSrc_TextureNV12BT709Src_[] = /* NV12 fragment source built with the BT.709 constants */ \
+        YUV_SHADER_PROLOGUE /* shared prologue: u_texture_v is declared but NV12_SHADER_BODY never samples it */ \
+        BT709_SHADER_CONSTANTS \
+        NV12_SHADER_BODY \
+;

 /* NV21 to ABGR conversion */
-static const Uint8 GLES2_FragmentSrc_TextureNV21Src_[] = " \
-    precision mediump float; \
-    uniform sampler2D u_texture; \
-    uniform sampler2D u_texture_u; \
-    uniform vec4 u_modulation; \
-    varying vec2 v_texCoord; \
-    \
-    void main() \
-    { \
-        mediump vec3 yuv; \
-        lowp vec3 rgb; \
-        yuv.x = texture2D(u_texture,   v_texCoord).r; \
-        yuv.yz = texture2D(u_texture_u, v_texCoord).ar - 0.5; \
-        rgb = mat3( 1,        1,       1, \
-                    0,       -0.39465, 2.03211, \
-                    1.13983, -0.58060, 0) * yuv; \
-        gl_FragColor = vec4(rgb, 1); \
-        gl_FragColor *= u_modulation; \
-    } \
-";
+static const Uint8 GLES2_FragmentSrc_TextureNV21JPEGSrc_[] = /* NV21 fragment source built with the JPEG constants */ \
+        YUV_SHADER_PROLOGUE /* shared prologue: u_texture_v is declared but NV21_SHADER_BODY never samples it */ \
+        JPEG_SHADER_CONSTANTS \
+        NV21_SHADER_BODY \
+;
+static const Uint8 GLES2_FragmentSrc_TextureNV21BT601Src_[] = /* NV21 fragment source built with the BT.601 constants */ \
+        YUV_SHADER_PROLOGUE /* shared prologue: u_texture_v is declared but NV21_SHADER_BODY never samples it */ \
+        BT601_SHADER_CONSTANTS \
+        NV21_SHADER_BODY \
+;
+static const Uint8 GLES2_FragmentSrc_TextureNV21BT709Src_[] = /* NV21 fragment source built with the BT.709 constants */ \
+        YUV_SHADER_PROLOGUE /* shared prologue: u_texture_v is declared but NV21_SHADER_BODY never samples it */ \
+        BT709_SHADER_CONSTANTS \
+        NV21_SHADER_BODY \
+;

 static const GLES2_ShaderInstance GLES2_VertexSrc_Default = {
    GL_VERTEX_SHADER,
@ -236,25 +317,67 @@ static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureBGRSrc = {
    GLES2_FragmentSrc_TextureBGRSrc_
 };

-static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureYUVSrc = {
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureYUVJPEGSrc = {
    GL_FRAGMENT_SHADER,
    GLES2_SOURCE_SHADER,
-    sizeof(GLES2_FragmentSrc_TextureYUVSrc_),
-    GLES2_FragmentSrc_TextureYUVSrc_
+    sizeof(GLES2_FragmentSrc_TextureYUVJPEGSrc_),
+    GLES2_FragmentSrc_TextureYUVJPEGSrc_
 };

-static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureNV12Src = {
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureYUVBT601Src = {
    GL_FRAGMENT_SHADER,
    GLES2_SOURCE_SHADER,
-    sizeof(GLES2_FragmentSrc_TextureNV12Src_),
-    GLES2_FragmentSrc_TextureNV12Src_
+    sizeof(GLES2_FragmentSrc_TextureYUVBT601Src_),
+    GLES2_FragmentSrc_TextureYUVBT601Src_
 };

-static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureNV21Src = {
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureYUVBT709Src = {
    GL_FRAGMENT_SHADER,
    GLES2_SOURCE_SHADER,
-    sizeof(GLES2_FragmentSrc_TextureNV21Src_),
-    GLES2_FragmentSrc_TextureNV21Src_
+    sizeof(GLES2_FragmentSrc_TextureYUVBT709Src_),
+    GLES2_FragmentSrc_TextureYUVBT709Src_
+};
+
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureNV12JPEGSrc = { /* wraps the NV12 JPEG source text as a fragment shader instance */
+    GL_FRAGMENT_SHADER,
+    GLES2_SOURCE_SHADER, /* defined as (GLenum)-1 in SDL_shaders_gles2.h */
+    sizeof(GLES2_FragmentSrc_TextureNV12JPEGSrc_), /* size of the whole array, terminating NUL of the literal included */
+    GLES2_FragmentSrc_TextureNV12JPEGSrc_
+};
+
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureNV12BT601Src = { /* wraps the NV12 BT.601 source text as a fragment shader instance */
+    GL_FRAGMENT_SHADER,
+    GLES2_SOURCE_SHADER, /* defined as (GLenum)-1 in SDL_shaders_gles2.h */
+    sizeof(GLES2_FragmentSrc_TextureNV12BT601Src_), /* size of the whole array, terminating NUL of the literal included */
+    GLES2_FragmentSrc_TextureNV12BT601Src_
+};
+
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureNV21BT709Src = { /* wraps the NV21 BT.709 source text; listed among the NV12 instances, which has no effect because it is referenced by name */
+    GL_FRAGMENT_SHADER,
+    GLES2_SOURCE_SHADER, /* defined as (GLenum)-1 in SDL_shaders_gles2.h */
+    sizeof(GLES2_FragmentSrc_TextureNV21BT709Src_), /* size of the whole array, terminating NUL of the literal included */
+    GLES2_FragmentSrc_TextureNV21BT709Src_
+};
+
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureNV21JPEGSrc = { /* wraps the NV21 JPEG source text as a fragment shader instance */
+    GL_FRAGMENT_SHADER,
+    GLES2_SOURCE_SHADER, /* defined as (GLenum)-1 in SDL_shaders_gles2.h */
+    sizeof(GLES2_FragmentSrc_TextureNV21JPEGSrc_), /* size of the whole array, terminating NUL of the literal included */
+    GLES2_FragmentSrc_TextureNV21JPEGSrc_
+};
+
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureNV21BT601Src = { /* wraps the NV21 BT.601 source text as a fragment shader instance */
+    GL_FRAGMENT_SHADER,
+    GLES2_SOURCE_SHADER, /* defined as (GLenum)-1 in SDL_shaders_gles2.h */
+    sizeof(GLES2_FragmentSrc_TextureNV21BT601Src_), /* size of the whole array, terminating NUL of the literal included */
+    GLES2_FragmentSrc_TextureNV21BT601Src_
+};
+
+static const GLES2_ShaderInstance GLES2_FragmentSrc_TextureNV12BT709Src = {
+    GL_FRAGMENT_SHADER,
+    GLES2_SOURCE_SHADER,
+    sizeof(GLES2_FragmentSrc_TextureNV12BT709Src_),
+    GLES2_FragmentSrc_TextureNV12BT709Src_
 };


@ -304,24 +427,66 @@ static GLES2_Shader GLES2_FragmentShader_TextureBGRSrc = {
    }
 };

-static GLES2_Shader GLES2_FragmentShader_TextureYUVSrc = {
+static GLES2_Shader GLES2_FragmentShader_TextureYUVJPEGSrc = {
    1,
    {
-        &GLES2_FragmentSrc_TextureYUVSrc
+        &GLES2_FragmentSrc_TextureYUVJPEGSrc
    }
 };

-static GLES2_Shader GLES2_FragmentShader_TextureNV12Src = {
+static GLES2_Shader GLES2_FragmentShader_TextureYUVBT601Src = {
    1,
    {
-        &GLES2_FragmentSrc_TextureNV12Src
+        &GLES2_FragmentSrc_TextureYUVBT601Src
    }
 };

-static GLES2_Shader GLES2_FragmentShader_TextureNV21Src = {
+static GLES2_Shader GLES2_FragmentShader_TextureYUVBT709Src = {
    1,
    {
-        &GLES2_FragmentSrc_TextureNV21Src
+        &GLES2_FragmentSrc_TextureYUVBT709Src
+    }
+};
+
+static GLES2_Shader GLES2_FragmentShader_TextureNV12JPEGSrc = { /* returned by GLES2_GetShader for GLES2_SHADER_FRAGMENT_TEXTURE_NV12_JPEG_SRC */
+    1, /* presumably the number of instances listed below; confirm against the GLES2_Shader declaration */
+    {
+        &GLES2_FragmentSrc_TextureNV12JPEGSrc
+    }
+};
+
+static GLES2_Shader GLES2_FragmentShader_TextureNV12BT601Src = { /* returned by GLES2_GetShader for GLES2_SHADER_FRAGMENT_TEXTURE_NV12_BT601_SRC */
+    1, /* presumably the number of instances listed below; confirm against the GLES2_Shader declaration */
+    {
+        &GLES2_FragmentSrc_TextureNV12BT601Src
+    }
+};
+
+static GLES2_Shader GLES2_FragmentShader_TextureNV12BT709Src = { /* returned by GLES2_GetShader for GLES2_SHADER_FRAGMENT_TEXTURE_NV12_BT709_SRC */
+    1, /* presumably the number of instances listed below; confirm against the GLES2_Shader declaration */
+    {
+        &GLES2_FragmentSrc_TextureNV12BT709Src
+    }
+};
+
+static GLES2_Shader GLES2_FragmentShader_TextureNV21JPEGSrc = { /* returned by GLES2_GetShader for GLES2_SHADER_FRAGMENT_TEXTURE_NV21_JPEG_SRC */
+    1, /* presumably the number of instances listed below; confirm against the GLES2_Shader declaration */
+    {
+        &GLES2_FragmentSrc_TextureNV21JPEGSrc
+    }
+};
+
+static GLES2_Shader GLES2_FragmentShader_TextureNV21BT601Src = { /* returned by GLES2_GetShader for GLES2_SHADER_FRAGMENT_TEXTURE_NV21_BT601_SRC */
+    1, /* presumably the number of instances listed below; confirm against the GLES2_Shader declaration */
+    {
+        &GLES2_FragmentSrc_TextureNV21BT601Src
+    }
+};
+
+static GLES2_Shader GLES2_FragmentShader_TextureNV21BT709Src = {
+    1,
+    {
+        &GLES2_FragmentSrc_TextureNV21BT709Src
    }
 };

@ -345,12 +510,24 @@ const GLES2_Shader *GLES2_GetShader(GLES2_ShaderType type)
        return &GLES2_FragmentShader_TextureRGBSrc;
    case GLES2_SHADER_FRAGMENT_TEXTURE_BGR_SRC:
        return &GLES2_FragmentShader_TextureBGRSrc;
-    case GLES2_SHADER_FRAGMENT_TEXTURE_YUV_SRC:
-        return &GLES2_FragmentShader_TextureYUVSrc;
-    case GLES2_SHADER_FRAGMENT_TEXTURE_NV12_SRC:
-        return &GLES2_FragmentShader_TextureNV12Src;
-    case GLES2_SHADER_FRAGMENT_TEXTURE_NV21_SRC:
-        return &GLES2_FragmentShader_TextureNV21Src;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_YUV_JPEG_SRC:
+        return &GLES2_FragmentShader_TextureYUVJPEGSrc;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_YUV_BT601_SRC:
+        return &GLES2_FragmentShader_TextureYUVBT601Src;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_YUV_BT709_SRC:
+        return &GLES2_FragmentShader_TextureYUVBT709Src;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_NV12_JPEG_SRC:
+        return &GLES2_FragmentShader_TextureNV12JPEGSrc;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_NV12_BT601_SRC:
+        return &GLES2_FragmentShader_TextureNV12BT601Src;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_NV12_BT709_SRC:
+        return &GLES2_FragmentShader_TextureNV12BT709Src;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_NV21_JPEG_SRC:
+        return &GLES2_FragmentShader_TextureNV21JPEGSrc;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_NV21_BT601_SRC:
+        return &GLES2_FragmentShader_TextureNV21BT601Src;
+    case GLES2_SHADER_FRAGMENT_TEXTURE_NV21_BT709_SRC:
+        return &GLES2_FragmentShader_TextureNV21BT709Src;
    default:
        return NULL;
    }
--- a/src/render/opengles2/SDL_shaders_gles2.h
+++ b/src/render/opengles2/SDL_shaders_gles2.h
@ -20,11 +20,11 @@
 */
 #include "../../SDL_internal.h"

-#if SDL_VIDEO_RENDER_OGL_ES2
-
 #ifndef SDL_shaders_gles2_h_
 #define SDL_shaders_gles2_h_

+#if SDL_VIDEO_RENDER_OGL_ES2
+
 typedef struct GLES2_ShaderInstance
 {
    GLenum type;
@ -47,17 +47,23 @@ typedef enum
    GLES2_SHADER_FRAGMENT_TEXTURE_ARGB_SRC,
    GLES2_SHADER_FRAGMENT_TEXTURE_BGR_SRC,
    GLES2_SHADER_FRAGMENT_TEXTURE_RGB_SRC,
-    GLES2_SHADER_FRAGMENT_TEXTURE_YUV_SRC,
-    GLES2_SHADER_FRAGMENT_TEXTURE_NV12_SRC,
-    GLES2_SHADER_FRAGMENT_TEXTURE_NV21_SRC
+    GLES2_SHADER_FRAGMENT_TEXTURE_YUV_JPEG_SRC,
+    GLES2_SHADER_FRAGMENT_TEXTURE_YUV_BT601_SRC,
+    GLES2_SHADER_FRAGMENT_TEXTURE_YUV_BT709_SRC,
+    GLES2_SHADER_FRAGMENT_TEXTURE_NV12_JPEG_SRC,
+    GLES2_SHADER_FRAGMENT_TEXTURE_NV12_BT601_SRC,
+    GLES2_SHADER_FRAGMENT_TEXTURE_NV12_BT709_SRC,
+    GLES2_SHADER_FRAGMENT_TEXTURE_NV21_JPEG_SRC,
+    GLES2_SHADER_FRAGMENT_TEXTURE_NV21_BT601_SRC,
+    GLES2_SHADER_FRAGMENT_TEXTURE_NV21_BT709_SRC,
 } GLES2_ShaderType;

 #define GLES2_SOURCE_SHADER (GLenum)-1

 const GLES2_Shader *GLES2_GetShader(GLES2_ShaderType type);

-#endif /* SDL_shaders_gles2_h_ */
-
 #endif /* SDL_VIDEO_RENDER_OGL_ES2 */

+#endif /* SDL_shaders_gles2_h_ */
+
 /* vi: set ts=4 sw=4 expandtab: */