From 6655a94c5c7e4c845cb5b6c9c3aa1df04054a48b Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Tue, 19 Mar 2013 11:50:20 -0400
Subject: [PATCH] mesa-9.1-53-gd0ccb5b.patch: Sync with today's git

---
 mesa-9.1-53-gd0ccb5b.patch | 1974 ++++++++++++++++++++++++++++++++++++
 mesa.spec                  |   25 +-
 2 files changed, 1988 insertions(+), 11 deletions(-)
 create mode 100644 mesa-9.1-53-gd0ccb5b.patch
diff --git a/mesa-9.1-53-gd0ccb5b.patch b/mesa-9.1-53-gd0ccb5b.patch
new file mode 100644
index 0000000..66b13fc
--- /dev/null
+++ b/mesa-9.1-53-gd0ccb5b.patch
@@ -0,0 +1,1974 @@
+diff --git a/bin/get-pick-list.sh b/bin/get-pick-list.sh
+index a141afe..d3ac511 100755
+--- a/bin/get-pick-list.sh
++++ b/bin/get-pick-list.sh
+@@ -8,7 +8,7 @@ git log --reverse --grep="cherry picked from commit" origin/master..HEAD |\
+ 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked
+ 
+ # Grep for commits that were marked as a candidate for the stable tree.
+-git log --reverse --pretty=%H -i --grep='^[[:space:]]*NOTE: This is a candidate' HEAD..origin/master |\
++git log --reverse --pretty=%H -i --grep='^[[:space:]]*NOTE: .*[Cc]andidate' HEAD..origin/master |\
+ while read sha
+ do
+ 	# Check to see whether the patch is on the ignore list.
+diff --git a/common.py b/common.py
+index 6ff9608..1d618e6 100644
+--- a/common.py
++++ b/common.py
+@@ -100,4 +100,4 @@ def AddOptions(opts):
+ 	opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
+ 	opts.Add(BoolOption('texture_float', 'enable floating-point textures and renderbuffers', 'no'))
+ 	if host_platform == 'windows':
+-		opts.Add(EnumOption('MSVS_VERSION', 'MS Visual C++ version', None, allowed_values=('7.1', '8.0', '9.0')))
++		opts.Add(EnumOption('MSVC_VERSION', 'MS Visual C++ version', None, allowed_values=('7.1', '8.0', '9.0', '10.0', '11.0')))
+diff --git a/configure.ac b/configure.ac
+index 5701f8a..d75cf65 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -1682,6 +1682,9 @@ if test "x$enable_gallium_llvm" = xyes; then
+         if $LLVM_CONFIG --components | grep -q '\<mcjit\>'; then
+             LLVM_COMPONENTS="${LLVM_COMPONENTS} mcjit"
+         fi
++        if $LLVM_CONFIG --components | grep -q '\<oprofilejit\>'; then
++            LLVM_COMPONENTS="${LLVM_COMPONENTS} oprofilejit"
++        fi
+ 
+         if test "x$enable_opencl" = xyes; then
+             LLVM_COMPONENTS="${LLVM_COMPONENTS} ipo linker instrumentation"
+diff --git a/docs/index.html b/docs/index.html
+index 5c92204..5d7229d 100644
+--- a/docs/index.html
++++ b/docs/index.html
+@@ -16,6 +16,23 @@
+ 
+ <h1>News</h1>
+ 
++<h2>February 22, 2013</h2>
++
++<p>
++<a href="relnotes-9.1.html">Mesa 9.1</a> is released.
++This is a new development release.
++See the release notes for more information about the release.
++</p>
++
++
++<h2>February 21, 2013</h2>
++
++<p>
++<a href="relnotes-9.0.3.html">Mesa 9.0.3</a> is released.
++This is a bug fix release.
++</p>
++
++
+ <h2>January 22, 2013</h2>
+ 
+ <p>
+diff --git a/docs/relnotes-9.1.html b/docs/relnotes-9.1.html
+index 24ba9f9..8232ab8 100644
+--- a/docs/relnotes-9.1.html
++++ b/docs/relnotes-9.1.html
+@@ -14,7 +14,7 @@
+ <iframe src="contents.html"></iframe>
+ <div class="content">
+ 
+-<h1>Mesa 9.1 Release Notes / date February 22, 2013</h1>
++<h1>Mesa 9.1 Release Notes / February 22, 2013</h1>
+ 
+ <p>
+ Mesa 9.1 is a new development release.
+@@ -33,7 +33,9 @@ because GL_ARB_compatibility is not supported.
+ 
+ <h2>MD5 checksums</h2>
+ <pre>
+-tbd
++86d40f3056f89949368764bf84aff55e  MesaLib-9.1.tar.gz
++d3891e02215422e120271d976ff1947e  MesaLib-9.1.tar.bz2
++01645f28f53351c23b0beb6c688911d8  MesaLib-9.1.zip
+ </pre>
+ 
+ 
+diff --git a/docs/relnotes.html b/docs/relnotes.html
+index e373091..2e11bc4 100644
+--- a/docs/relnotes.html
++++ b/docs/relnotes.html
+@@ -22,6 +22,7 @@ The release notes summarize what's new or changed in each Mesa release.
+ 
+ <ul>
+ <li><a href="relnotes-9.1.html">9.1 release notes</a>
++<li><a href="relnotes-9.0.3.html">9.0.3 release notes</a>
+ <li><a href="relnotes-9.0.2.html">9.0.2 release notes</a>
+ <li><a href="relnotes-9.0.1.html">9.0.1 release notes</a>
+ <li><a href="relnotes-9.0.html">9.0 release notes</a>
+diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
+index 09dca5b..1e388f8 100644
+--- a/include/pci_ids/i965_pci_ids.h
++++ b/include/pci_ids/i965_pci_ids.h
+@@ -53,12 +53,12 @@ CHIPSET(0x0A26, HASWELL_ULT_M_GT2_PLUS, hsw_gt2)
+ CHIPSET(0x0A0A, HASWELL_ULT_S_GT1, hsw_gt1)
+ CHIPSET(0x0A1A, HASWELL_ULT_S_GT2, hsw_gt2)
+ CHIPSET(0x0A2A, HASWELL_ULT_S_GT2_PLUS, hsw_gt2)
+-CHIPSET(0x0D12, HASWELL_CRW_GT1, hsw_gt1)
+-CHIPSET(0x0D22, HASWELL_CRW_GT2, hsw_gt2)
+-CHIPSET(0x0D32, HASWELL_CRW_GT2_PLUS, hsw_gt2)
+-CHIPSET(0x0D16, HASWELL_CRW_M_GT1, hsw_gt1)
+-CHIPSET(0x0D26, HASWELL_CRW_M_GT2, hsw_gt2)
+-CHIPSET(0x0D36, HASWELL_CRW_M_GT2_PLUS, hsw_gt2)
+-CHIPSET(0x0D1A, HASWELL_CRW_S_GT1, hsw_gt1)
+-CHIPSET(0x0D2A, HASWELL_CRW_S_GT2, hsw_gt2)
+-CHIPSET(0x0D3A, HASWELL_CRW_S_GT2_PLUS, hsw_gt2)
++CHIPSET(0x0D02, HASWELL_CRW_GT1, hsw_gt1)
++CHIPSET(0x0D12, HASWELL_CRW_GT2, hsw_gt2)
++CHIPSET(0x0D22, HASWELL_CRW_GT2_PLUS, hsw_gt2)
++CHIPSET(0x0D06, HASWELL_CRW_M_GT1, hsw_gt1)
++CHIPSET(0x0D16, HASWELL_CRW_M_GT2, hsw_gt2)
++CHIPSET(0x0D26, HASWELL_CRW_M_GT2_PLUS, hsw_gt2)
++CHIPSET(0x0D0A, HASWELL_CRW_S_GT1, hsw_gt1)
++CHIPSET(0x0D1A, HASWELL_CRW_S_GT2, hsw_gt2)
++CHIPSET(0x0D2A, HASWELL_CRW_S_GT2_PLUS, hsw_gt2)
+diff --git a/include/pci_ids/r600_pci_ids.h b/include/pci_ids/r600_pci_ids.h
+index 7ceb820..9c9bab2 100644
+--- a/include/pci_ids/r600_pci_ids.h
++++ b/include/pci_ids/r600_pci_ids.h
+@@ -298,6 +298,10 @@ CHIPSET(0x9907, ARUBA_9907, ARUBA)
+ CHIPSET(0x9908, ARUBA_9908, ARUBA)
+ CHIPSET(0x9909, ARUBA_9909, ARUBA)
+ CHIPSET(0x990A, ARUBA_990A, ARUBA)
++CHIPSET(0x990B, ARUBA_990B, ARUBA)
++CHIPSET(0x990C, ARUBA_990C, ARUBA)
++CHIPSET(0x990D, ARUBA_990D, ARUBA)
++CHIPSET(0x990E, ARUBA_990E, ARUBA)
+ CHIPSET(0x990F, ARUBA_990F, ARUBA)
+ CHIPSET(0x9910, ARUBA_9910, ARUBA)
+ CHIPSET(0x9913, ARUBA_9913, ARUBA)
+@@ -309,6 +313,13 @@ CHIPSET(0x9991, ARUBA_9991, ARUBA)
+ CHIPSET(0x9992, ARUBA_9992, ARUBA)
+ CHIPSET(0x9993, ARUBA_9993, ARUBA)
+ CHIPSET(0x9994, ARUBA_9994, ARUBA)
++CHIPSET(0x9995, ARUBA_9995, ARUBA)
++CHIPSET(0x9996, ARUBA_9996, ARUBA)
++CHIPSET(0x9997, ARUBA_9997, ARUBA)
++CHIPSET(0x9998, ARUBA_9998, ARUBA)
++CHIPSET(0x9999, ARUBA_9999, ARUBA)
++CHIPSET(0x999A, ARUBA_999A, ARUBA)
++CHIPSET(0x999B, ARUBA_999B, ARUBA)
+ CHIPSET(0x99A0, ARUBA_99A0, ARUBA)
+ CHIPSET(0x99A2, ARUBA_99A2, ARUBA)
+ CHIPSET(0x99A4, ARUBA_99A4, ARUBA)
+diff --git a/scons/gallium.py b/scons/gallium.py
+index 4b51b6e..b28be5d 100755
+--- a/scons/gallium.py
++++ b/scons/gallium.py
+@@ -289,6 +289,7 @@ def generate(env):
+                 '_CRT_SECURE_NO_DEPRECATE',
+                 '_SCL_SECURE_NO_WARNINGS',
+                 '_SCL_SECURE_NO_DEPRECATE',
++                '_ALLOW_KEYWORD_MACROS',
+             ]
+         if env['build'] in ('debug', 'checked'):
+             cppdefines += ['_DEBUG']
+@@ -401,6 +402,8 @@ def generate(env):
+               '/Oi', # enable intrinsic functions
+             ]
+         else:
++            if distutils.version.LooseVersion(env['MSVC_VERSION']) < distutils.version.LooseVersion('11.0'):
++                print 'scons: warning: Visual Studio versions prior to 2012 are known to produce incorrect code when optimizations are enabled ( https://bugs.freedesktop.org/show_bug.cgi?id=58718 )'
+             ccflags += [
+                 '/O2', # optimize for speed
+             ]
+diff --git a/scons/llvm.py b/scons/llvm.py
+index e1ed760..7f00c6c 100644
+--- a/scons/llvm.py
++++ b/scons/llvm.py
+@@ -92,7 +92,19 @@ def generate(env):
+             'HAVE_STDINT_H',
+         ])
+         env.Prepend(LIBPATH = [os.path.join(llvm_dir, 'lib')])
+-        if llvm_version >= distutils.version.LooseVersion('3.0'):
++        if llvm_version >= distutils.version.LooseVersion('3.2'):
++            # 3.2
++            env.Prepend(LIBS = [
++                'LLVMBitWriter', 'LLVMX86Disassembler', 'LLVMX86AsmParser',
++                'LLVMX86CodeGen', 'LLVMX86Desc', 'LLVMSelectionDAG',
++                'LLVMAsmPrinter', 'LLVMMCParser', 'LLVMX86AsmPrinter',
++                'LLVMX86Utils', 'LLVMX86Info', 'LLVMJIT',
++                'LLVMExecutionEngine', 'LLVMCodeGen', 'LLVMScalarOpts',
++                'LLVMInstCombine', 'LLVMTransformUtils', 'LLVMipa',
++                'LLVMAnalysis', 'LLVMTarget', 'LLVMMC', 'LLVMCore',
++                'LLVMSupport', 'LLVMRuntimeDyld', 'LLVMObject'
++            ])
++        elif llvm_version >= distutils.version.LooseVersion('3.0'):
+             # 3.0
+             env.Prepend(LIBS = [
+                 'LLVMBitWriter', 'LLVMX86Disassembler', 'LLVMX86AsmParser',
+diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
+index 351fbf4..e17d5be 100644
+--- a/src/egl/drivers/dri2/egl_dri2.c
++++ b/src/egl/drivers/dri2/egl_dri2.c
+@@ -195,7 +195,14 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
+       for (i = 0; attr_list[i] != EGL_NONE; i += 2)
+          _eglSetConfigKey(&base, attr_list[i], attr_list[i+1]);
+ 
+-   if (depth > 0 && depth != base.BufferSize)
++   /* Allow a 24-bit RGB visual to match a 32-bit RGBA EGLConfig.  Otherwise
++    * it will only match a 32-bit RGBA visual.  On a composited window manager
++    * on X11, this will make all of the EGLConfigs with destination alpha get
++    * blended by the compositor.  This is probably not what the application
++    * wants... especially on drivers that only have 32-bit RGBA EGLConfigs!
++    */
++   if (depth > 0 && depth != base.BufferSize
++       && !(depth == 24 && base.BufferSize == 32))
+       return NULL;
+ 
+    if (rgba_masks && memcmp(rgba_masks, dri_masks, sizeof(dri_masks)))
+diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
+index 7b879c4..3110809 100644
+--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
++++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
+@@ -167,12 +167,17 @@ static void interp( const struct clip_stage *clip,
+    {
+       int k;
+       t_nopersp = t;
+-      for (k = 0; k < 2; k++)
++      /* find either in.x != out.x or in.y != out.y */
++      for (k = 0; k < 2; k++) {
+          if (in->clip[k] != out->clip[k]) {
+-            t_nopersp = (dst->clip[k] - out->clip[k]) /
+-               (in->clip[k] - out->clip[k]);
++            /* do divide by W, then compute linear interpolation factor */
++            float in_coord = in->clip[k] / in->clip[3];
++            float out_coord = out->clip[k] / out->clip[3];
++            float dst_coord = dst->clip[k] / dst->clip[3];
++            t_nopersp = (dst_coord - out_coord) / (in_coord - out_coord);
+             break;
+          }
++      }
+    }
+ 
+    /* Other attributes
+diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
+index 3da52b1..3578525 100644
+--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
++++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
+@@ -127,10 +127,44 @@ static void offset_first_tri( struct draw_stage *stage,
+ 			      struct prim_header *header )
+ {
+    struct offset_stage *offset = offset_stage(stage);
++   const struct pipe_rasterizer_state *rast = stage->draw->rasterizer;
++   unsigned fill_mode = rast->fill_front;
++   boolean do_offset;
++
++   if (rast->fill_back != rast->fill_front) {
++      /* Need to check for back-facing triangle */
++      boolean ccw = header->det < 0.0f;
++      if (ccw != rast->front_ccw)
++         fill_mode = rast->fill_back;
++   }
++
++   /* Now determine if we need to do offsetting for the point/line/fill mode */
++   switch (fill_mode) {
++   case PIPE_POLYGON_MODE_FILL:
++      do_offset = rast->offset_tri;
++      break;
++   case PIPE_POLYGON_MODE_LINE:
++      do_offset = rast->offset_line;
++      break;
++   case PIPE_POLYGON_MODE_POINT:
++      do_offset = rast->offset_point;
++      break;
++   default:
++      assert(!"invalid fill_mode in offset_first_tri()");
++      do_offset = rast->offset_tri;
++   }
++
++   if (do_offset) {
++      offset->scale = rast->offset_scale;
++      offset->clamp = rast->offset_clamp;
++      offset->units = (float) (rast->offset_units * stage->draw->mrd);
++   }
++   else {
++      offset->scale = 0.0f;
++      offset->clamp = 0.0f;
++      offset->units = 0.0f;
++   }
+ 
+-   offset->units = (float) (stage->draw->rasterizer->offset_units * stage->draw->mrd);
+-   offset->scale = stage->draw->rasterizer->offset_scale;
+-   offset->clamp = stage->draw->rasterizer->offset_clamp;
+ 
+    stage->tri = offset_tri;
+    stage->tri( stage, header );
+diff --git a/src/gallium/auxiliary/util/u_range.h b/src/gallium/auxiliary/util/u_range.h
+new file mode 100644
+index 0000000..4b1d0d1
+--- /dev/null
++++ b/src/gallium/auxiliary/util/u_range.h
+@@ -0,0 +1,89 @@
++/*
++ * Copyright 2013 Marek Olšák <maraeo@gmail.com>
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * on the rights to use, copy, modify, merge, publish, distribute, sub
++ * license, and/or sell copies of the Software, and to permit persons to whom
++ * the Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
++ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
++ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
++ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
++
++/**
++ * @file
++ * 1D integer range, capable of the union and intersection operations.
++ *
++ * It only maintains a single interval which is extended when the union is
++ * done. This implementation is partially thread-safe (readers are not
++ * protected by a lock).
++ *
++ * @author Marek Olšák
++ */
++
++#ifndef U_RANGE_H
++#define U_RANGE_H
++
++#include "os/os_thread.h"
++
++struct util_range {
++   unsigned start; /* inclusive */
++   unsigned end; /* exclusive */
++
++   /* for the range to be consistent with multiple contexts: */
++   pipe_mutex write_mutex;
++};
++
++
++static INLINE void
++util_range_set_empty(struct util_range *range)
++{
++   range->start = ~0;
++   range->end = 0;
++}
++
++/* This is like a union of two sets. */
++static INLINE void
++util_range_add(struct util_range *range, unsigned start, unsigned end)
++{
++   if (start < range->start || end > range->end) {
++      pipe_mutex_lock(range->write_mutex);
++      range->start = MIN2(start, range->start);
++      range->end = MAX2(end, range->end);
++      pipe_mutex_unlock(range->write_mutex);
++   }
++}
++
++static INLINE boolean
++util_ranges_intersect(struct util_range *range, unsigned start, unsigned end)
++{
++   return MAX2(start, range->start) < MIN2(end, range->end);
++}
++
++
++/* Init/deinit */
++
++static INLINE void
++util_range_init(struct util_range *range)
++{
++   pipe_mutex_init(range->write_mutex);
++   util_range_set_empty(range);
++}
++
++static INLINE void
++util_range_destroy(struct util_range *range)
++{
++   pipe_mutex_destroy(range->write_mutex);
++}
++
++#endif
+diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+index 40ccaf6..ca8df71 100644
+--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
++++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+@@ -46,6 +46,10 @@ clear_flags(struct pipe_rasterizer_state *rast)
+ {
+    rast->light_twoside = 0;
+    rast->offset_tri = 0;
++   rast->offset_line = 0;
++   rast->offset_point = 0;
++   rast->offset_units = 0.0f;
++   rast->offset_scale = 0.0f;
+ }
+ 
+ 
+@@ -74,6 +78,8 @@ llvmpipe_create_rasterizer_state(struct pipe_context *pipe,
+     */
+    need_pipeline = (rast->fill_front != PIPE_POLYGON_MODE_FILL ||
+ 		    rast->fill_back != PIPE_POLYGON_MODE_FILL ||
++                    rast->offset_point ||
++                    rast->offset_line ||
+ 		    rast->point_smooth ||
+ 		    rast->line_smooth ||
+ 		    rast->line_stipple_enable ||
+diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
+index 2e9c6bf..f17a04a 100644
+--- a/src/gallium/drivers/llvmpipe/lp_texture.c
++++ b/src/gallium/drivers/llvmpipe/lp_texture.c
+@@ -295,7 +295,9 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
+    /* assert(lpr->base.bind); */
+ 
+    if (resource_is_texture(&lpr->base)) {
+-      if (lpr->base.bind & PIPE_BIND_DISPLAY_TARGET) {
++      if (lpr->base.bind & (PIPE_BIND_DISPLAY_TARGET |
++                            PIPE_BIND_SCANOUT |
++                            PIPE_BIND_SHARED)) {
+          /* displayable surface */
+          if (!llvmpipe_displaytarget_layout(screen, lpr))
+             goto fail;
+diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
+index bb47530..bb43353 100644
+--- a/src/gallium/drivers/r600/evergreen_hw_context.c
++++ b/src/gallium/drivers/r600/evergreen_hw_context.c
+@@ -283,4 +283,7 @@ void evergreen_dma_copy(struct r600_context *rctx,
+ 		src_offset += csize << shift;
+ 		size -= csize;
+ 	}
++
++	util_range_add(&rdst->valid_buffer_range, dst_offset,
++		       dst_offset + size);
+ }
+diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
+index 389ad3c..804c037 100644
+--- a/src/gallium/drivers/r600/evergreen_state.c
++++ b/src/gallium/drivers/r600/evergreen_state.c
+@@ -808,6 +808,7 @@ static void *evergreen_create_dsa_state(struct pipe_context *ctx,
+ 	dsa->valuemask[1] = state->stencil[1].valuemask;
+ 	dsa->writemask[0] = state->stencil[0].writemask;
+ 	dsa->writemask[1] = state->stencil[1].writemask;
++	dsa->zwritemask = state->depth.writemask;
+ 
+ 	db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
+ 		S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
+@@ -1321,6 +1322,10 @@ void evergreen_init_color_surface_rat(struct r600_context *rctx,
+ 	 * elements. */
+ 	surf->cb_color_dim = pipe_buffer->width0;
+ 
++	/* Set the buffer range the GPU will have access to: */
++	util_range_add(&r600_resource(pipe_buffer)->valid_buffer_range,
++		       0, pipe_buffer->width0);
++
+ 	surf->cb_color_cmask = surf->cb_color_base;
+ 	surf->cb_color_cmask_slice = 0;
+ 	surf->cb_color_fmask = surf->cb_color_base;
+@@ -1405,10 +1410,15 @@ void evergreen_init_color_surface(struct r600_context *rctx,
+ 			S_028C74_NON_DISP_TILING_ORDER(non_disp_tiling) |
+ 		        S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
+ 
+-	if (rctx->chip_class == CAYMAN && rtex->resource.b.b.nr_samples > 1) {
+-		unsigned log_samples = util_logbase2(rtex->resource.b.b.nr_samples);
+-		color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
+-				S_028C74_NUM_FRAGMENTS(log_samples);
++	if (rctx->chip_class == CAYMAN) {
++		color_attrib |=	S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] ==
++							   UTIL_FORMAT_SWIZZLE_1);
++
++		if (rtex->resource.b.b.nr_samples > 1) {
++			unsigned log_samples = util_logbase2(rtex->resource.b.b.nr_samples);
++			color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
++					S_028C74_NUM_FRAGMENTS(log_samples);
++		}
+ 	}
+ 
+ 	ntype = V_028C70_NUMBER_UNORM;
+@@ -1647,6 +1657,11 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
+ 	}
+ 	if (rctx->framebuffer.state.zsbuf) {
+ 		rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
++
++		rtex = (struct r600_texture*)rctx->framebuffer.state.zsbuf->texture;
++		if (rtex->htile) {
++			rctx->flags |= R600_CONTEXT_FLUSH_AND_INV_DB_META;
++		}
+ 	}
+ 
+ 	util_copy_framebuffer_state(&rctx->framebuffer.state, state);
+@@ -2222,7 +2237,14 @@ static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_
+ 		}
+ 		db_render_override |= S_02800C_NOOP_CULL_DISABLE(1);
+ 	}
+-	if (rctx->db_state.rsurf && rctx->db_state.rsurf->htile_enabled) {
++	/* FIXME we should be able to use hyperz even if we are not writing to
++	 * zbuffer, but somehow this triggers a GPU lockup. See:
++	 *
++	 * https://bugs.freedesktop.org/show_bug.cgi?id=60848
++	 *
++	 * Disable hyperz for now if not writing to zbuffer.
++	 */
++	if (rctx->db_state.rsurf && rctx->db_state.rsurf->htile_enabled && rctx->zwritemask) {
+ 		/* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */
+ 		db_render_override |= S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_OFF);
+ 		/* This is to fix a lockup when hyperz and alpha test are enabled at
+@@ -3654,6 +3676,17 @@ boolean evergreen_dma_blit(struct pipe_context *ctx,
+ 		return FALSE;
+ 	}
+ 
++	/* 128 bpp surfaces require non_disp_tiling for both
++	 * tiled and linear buffers on cayman.  However, async
++	 * DMA only supports it on the tiled side.  As such
++	 * the tile order is backwards after a L2T/T2L packet.
++	 */
++	if ((rctx->chip_class == CAYMAN) &&
++	    (src_mode != dst_mode) &&
++	    (util_format_get_blocksize(src->format) >= 16)) {
++		return FALSE;
++	}
++
+ 	if (src_mode == dst_mode) {
+ 		uint64_t dst_offset, src_offset;
+ 		/* simple dma blit would do NOTE code here assume :
+diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
+index 11dbb3b..0115293 100644
+--- a/src/gallium/drivers/r600/r600.h
++++ b/src/gallium/drivers/r600/r600.h
+@@ -28,6 +28,7 @@
+ 
+ #include "../../winsys/radeon/drm/radeon_winsys.h"
+ #include "util/u_double_list.h"
++#include "util/u_range.h"
+ #include "util/u_transfer.h"
+ 
+ #define R600_ERR(fmt, args...) \
+@@ -50,6 +51,16 @@ struct r600_resource {
+ 
+ 	/* Resource state. */
+ 	unsigned			domains;
++
++	/* The buffer range which is initialized (with a write transfer,
++	 * streamout, DMA, or as a random access target). The rest of
++	 * the buffer is considered invalid and can be mapped unsynchronized.
++	 *
++	 * This allows unsynchronized mapping of a buffer range which hasn't
++	 * been used yet. It's for applications which forget to use
++	 * the unsynchronized map flag and expect the driver to figure it out.
++         */
++	struct util_range		valid_buffer_range;
+ };
+ 
+ #define R600_BLOCK_MAX_BO		32
+@@ -152,6 +163,7 @@ struct r600_so_target {
+ #define R600_CONTEXT_FLUSH_AND_INV		(1 << 4)
+ #define R600_CONTEXT_FLUSH_AND_INV_CB_META	(1 << 5)
+ #define R600_CONTEXT_PS_PARTIAL_FLUSH		(1 << 6)
++#define R600_CONTEXT_FLUSH_AND_INV_DB_META      (1 << 7)
+ 
+ struct r600_context;
+ struct r600_screen;
+diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
+index f25c6aa..bda425c 100644
+--- a/src/gallium/drivers/r600/r600_asm.c
++++ b/src/gallium/drivers/r600/r600_asm.c
+@@ -322,6 +322,7 @@ int r600_bytecode_add_output(struct r600_bytecode *bc, const struct r600_bytecod
+ 		output->swizzle_y == bc->cf_last->output.swizzle_y &&
+ 		output->swizzle_z == bc->cf_last->output.swizzle_z &&
+ 		output->swizzle_w == bc->cf_last->output.swizzle_w &&
++		output->comp_mask == bc->cf_last->output.comp_mask &&
+ 		(output->burst_count + bc->cf_last->output.burst_count) <= 16) {
+ 
+ 		if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
+@@ -873,12 +874,6 @@ static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
+ 	bank_swizzle[4] = SQ_ALU_SCL_210;
+ 	while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
+ 
+-		if (max_slots == 4) {
+-			for (i = 0; i < max_slots; i++) {
+-				if (bank_swizzle[i] == SQ_ALU_VEC_210)
+-				  return -1;
+-			}
+-		}
+ 		init_bank_swizzle(&bs);
+ 		if (scalar_only == false) {
+ 			for (i = 0; i < 4; i++) {
+@@ -910,8 +905,10 @@ static int check_and_set_bank_swizzle(struct r600_bytecode *bc,
+ 					bank_swizzle[i]++;
+ 					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
+ 						break;
+-					else
++					else if (i < max_slots - 1)
+ 						bank_swizzle[i] = SQ_ALU_VEC_012;
++					else
++						return -1;
+ 				}
+ 			}
+ 		}
+diff --git a/src/gallium/drivers/r600/r600_buffer.c b/src/gallium/drivers/r600/r600_buffer.c
+index 6df0d91..bb85fc1 100644
+--- a/src/gallium/drivers/r600/r600_buffer.c
++++ b/src/gallium/drivers/r600/r600_buffer.c
+@@ -34,6 +34,7 @@ static void r600_buffer_destroy(struct pipe_screen *screen,
+ {
+ 	struct r600_resource *rbuffer = r600_resource(buf);
+ 
++	util_range_destroy(&rbuffer->valid_buffer_range);
+ 	pb_reference(&rbuffer->buf, NULL);
+ 	FREE(rbuffer);
+ }
+@@ -98,6 +99,14 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
+ 
+ 	assert(box->x + box->width <= resource->width0);
+ 
++	/* See if the buffer range being mapped has never been initialized,
++	 * in which case it can be mapped unsynchronized. */
++	if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
++	    usage & PIPE_TRANSFER_WRITE &&
++	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
++		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
++	}
++
+ 	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
+ 	    !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+ 		assert(usage & PIPE_TRANSFER_WRITE);
+@@ -178,6 +187,7 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
+ {
+ 	struct r600_context *rctx = (struct r600_context*)pipe;
+ 	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
++	struct r600_resource *rbuffer = r600_resource(transfer->resource);
+ 
+ 	if (rtransfer->staging) {
+ 		struct pipe_resource *dst, *src;
+@@ -189,7 +199,7 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
+ 		doffset = transfer->box.x;
+ 		soffset = rtransfer->offset + transfer->box.x % R600_MAP_BUFFER_ALIGNMENT;
+ 		/* Copy the staging buffer into the original one. */
+-		if (rctx->rings.dma.cs && !(size % 4) && !(doffset % 4) && !(soffset)) {
++		if (rctx->rings.dma.cs && !(size % 4) && !(doffset % 4) && !(soffset % 4)) {
+ 			if (rctx->screen->chip_class >= EVERGREEN) {
+ 				evergreen_dma_copy(rctx, dst, src, doffset, soffset, size);
+ 			} else {
+@@ -203,6 +213,11 @@ static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
+ 		}
+ 		pipe_resource_reference((struct pipe_resource**)&rtransfer->staging, NULL);
+ 	}
++
++	if (transfer->usage & PIPE_TRANSFER_WRITE) {
++		util_range_add(&rbuffer->valid_buffer_range, transfer->box.x,
++			       transfer->box.x + transfer->box.width);
++	}
+ 	util_slab_free(&rctx->pool_transfers, transfer);
+ }
+ 
+@@ -259,6 +274,7 @@ bool r600_init_resource(struct r600_screen *rscreen,
+ 
+ 	res->cs_buf = rscreen->ws->buffer_get_cs_handle(res->buf);
+ 	res->domains = domains;
++	util_range_set_empty(&res->valid_buffer_range);
+ 	return true;
+ }
+ 
+@@ -275,6 +291,7 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
+ 	pipe_reference_init(&rbuffer->b.b.reference, 1);
+ 	rbuffer->b.b.screen = screen;
+ 	rbuffer->b.vtbl = &r600_buffer_vtbl;
++	util_range_init(&rbuffer->valid_buffer_range);
+ 
+ 	if (!r600_init_resource(rscreen, rbuffer, templ->width0, alignment, TRUE, templ->usage)) {
+ 		FREE(rbuffer);
+diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
+index 9091ec0..322381a 100644
+--- a/src/gallium/drivers/r600/r600_hw_context.c
++++ b/src/gallium/drivers/r600/r600_hw_context.c
+@@ -648,6 +648,12 @@ void r600_flush_emit(struct r600_context *rctx)
+ 		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0);
+ 	}
+ 
++	if (rctx->chip_class >= R700 &&
++	    (rctx->flags & R600_CONTEXT_FLUSH_AND_INV_DB_META)) {
++		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
++		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0);
++	}
++
+ 	if (rctx->flags & R600_CONTEXT_FLUSH_AND_INV) {
+ 		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+ 		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
+@@ -742,6 +748,7 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
+ 	 */
+ 	ctx->flags |= R600_CONTEXT_FLUSH_AND_INV |
+ 		      R600_CONTEXT_FLUSH_AND_INV_CB_META |
++		      R600_CONTEXT_FLUSH_AND_INV_DB_META |
+ 		      R600_CONTEXT_WAIT_3D_IDLE |
+ 		      R600_CONTEXT_WAIT_CP_DMA_IDLE;
+ 
+@@ -1119,6 +1126,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
+ 	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES |
+ 		       R600_CONTEXT_FLUSH_AND_INV |
+ 		       R600_CONTEXT_FLUSH_AND_INV_CB_META |
++		       R600_CONTEXT_FLUSH_AND_INV_DB_META |
+ 		       R600_CONTEXT_STREAMOUT_FLUSH |
+ 		       R600_CONTEXT_WAIT_3D_IDLE;
+ 
+@@ -1164,6 +1172,9 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
+ 
+ 	/* Invalidate the read caches. */
+ 	rctx->flags |= R600_CONTEXT_INVAL_READ_CACHES;
++
++	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
++		       dst_offset + size);
+ }
+ 
+ void r600_need_dma_space(struct r600_context *ctx, unsigned num_dw)
+@@ -1210,4 +1221,7 @@ void r600_dma_copy(struct r600_context *rctx,
+ 		src_offset += csize << shift;
+ 		size -= csize;
+ 	}
++
++	util_range_add(&rdst->valid_buffer_range, dst_offset,
++		       dst_offset + size);
+ }
+diff --git a/src/gallium/drivers/r600/r600_hw_context_priv.h b/src/gallium/drivers/r600/r600_hw_context_priv.h
+index 692e6ec..3b50f68 100644
+--- a/src/gallium/drivers/r600/r600_hw_context_priv.h
++++ b/src/gallium/drivers/r600/r600_hw_context_priv.h
+@@ -29,7 +29,7 @@
+ #include "r600_pipe.h"
+ 
+ /* the number of CS dwords for flushing and drawing */
+-#define R600_MAX_FLUSH_CS_DWORDS	12
++#define R600_MAX_FLUSH_CS_DWORDS	16
+ #define R600_MAX_DRAW_CS_DWORDS		34
+ #define R600_TRACE_CS_DWORDS		7
+ 
+diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
+index fa66fcc..7a41688 100644
+--- a/src/gallium/drivers/r600/r600_llvm.c
++++ b/src/gallium/drivers/r600/r600_llvm.c
+@@ -38,8 +38,12 @@ static LLVMValueRef llvm_fetch_const(
+ 		LLVMValueRef index = LLVMBuildLoad(bld_base->base.gallivm->builder, bld->addr[reg->Indirect.Index][reg->Indirect.SwizzleX], "");
+ 		offset[1] = LLVMBuildAdd(bld_base->base.gallivm->builder, offset[1], index, "");
+ 	}
++	unsigned ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ;
++	if (reg->Register.Dimension) {
++		ConstantAddressSpace += reg->Dimension.Index;
++	}
+ 	LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base->base.elem_type, 4), 1024),
+-							CONSTANT_BUFFER_0_ADDR_SPACE);
++							ConstantAddressSpace);
+ 	LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base->base.gallivm->builder, lp_build_const_int32(bld_base->base.gallivm, 0), const_ptr_type, "");
+ 	LLVMValueRef ptr = LLVMBuildGEP(bld_base->base.gallivm->builder, const_ptr, offset, 2, "");
+ 	LLVMValueRef cvecval = LLVMBuildLoad(bld_base->base.gallivm->builder, ptr, "");
+diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
+index a59578d..a7973a5 100644
+--- a/src/gallium/drivers/r600/r600_pipe.c
++++ b/src/gallium/drivers/r600/r600_pipe.c
+@@ -22,6 +22,7 @@
+  */
+ #include "r600_pipe.h"
+ #include "r600_public.h"
++#include "r600d.h"
+ 
+ #include <errno.h>
+ #include "pipe/p_shader_tokens.h"
+@@ -165,12 +166,23 @@ static void r600_flush_gfx_ring(void *ctx, unsigned flags)
+ static void r600_flush_dma_ring(void *ctx, unsigned flags)
+ {
+ 	struct r600_context *rctx = (struct r600_context *)ctx;
++	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
++	unsigned padding_dw, i;
+ 
+-	if (!rctx->rings.dma.cs->cdw) {
++	if (!cs->cdw) {
+ 		return;
+ 	}
++
++	/* Pad the DMA CS to a multiple of 8 dwords. */
++	padding_dw = 8 - cs->cdw % 8;
++	if (padding_dw < 8) {
++		for (i = 0; i < padding_dw; i++) {
++			cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0);
++		}
++	}
++
+ 	rctx->rings.dma.flushing = true;
+-	rctx->ws->cs_flush(rctx->rings.dma.cs, flags);
++	rctx->ws->cs_flush(cs, flags);
+ 	rctx->rings.dma.flushing = false;
+ }
+ 
+diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
+index ec59c92..1be4321 100644
+--- a/src/gallium/drivers/r600/r600_pipe.h
++++ b/src/gallium/drivers/r600/r600_pipe.h
+@@ -298,7 +298,8 @@ struct r600_dsa_state {
+ 	unsigned			alpha_ref;
+ 	ubyte				valuemask[2];
+ 	ubyte				writemask[2];
+-	unsigned                        sx_alpha_test_control;
++	unsigned			zwritemask;
++	unsigned			sx_alpha_test_control;
+ };
+ 
+ struct r600_pipe_shader;
+@@ -513,6 +514,7 @@ struct r600_context {
+ 	bool				alpha_to_one;
+ 	bool				force_blend_disable;
+ 	boolean				dual_src_blend;
++	unsigned			zwritemask;
+ 
+ 	/* Index buffer. */
+ 	struct pipe_index_buffer	index_buffer;
+diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
+index 3f165f7..70232fd 100644
+--- a/src/gallium/drivers/r600/r600_state.c
++++ b/src/gallium/drivers/r600/r600_state.c
+@@ -802,6 +802,7 @@ static void *r600_create_dsa_state(struct pipe_context *ctx,
+ 	dsa->valuemask[1] = state->stencil[1].valuemask;
+ 	dsa->writemask[0] = state->stencil[0].writemask;
+ 	dsa->writemask[1] = state->stencil[1].writemask;
++	dsa->zwritemask = state->depth.writemask;
+ 
+ 	db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
+ 		S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
+@@ -1515,6 +1516,11 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
+ 	}
+ 	if (rctx->framebuffer.state.zsbuf) {
+ 		rctx->flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
++
++		rtex = (struct r600_texture*)rctx->framebuffer.state.zsbuf->texture;
++		if (rctx->chip_class >= R700 && rtex->htile) {
++			rctx->flags |= R600_CONTEXT_FLUSH_AND_INV_DB_META;
++		}
+ 	}
+ 
+ 	/* Set the new state. */
+diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
+index 88bb62b..f0e9de3 100644
+--- a/src/gallium/drivers/r600/r600_state_common.c
++++ b/src/gallium/drivers/r600/r600_state_common.c
+@@ -284,6 +284,16 @@ static void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
+ 	ref.valuemask[1] = dsa->valuemask[1];
+ 	ref.writemask[0] = dsa->writemask[0];
+ 	ref.writemask[1] = dsa->writemask[1];
++	if (rctx->zwritemask != dsa->zwritemask) {
++		rctx->zwritemask = dsa->zwritemask;
++		if (rctx->chip_class >= EVERGREEN) {
++			/* work around some issue when not writing to zbuffer
++			 * we are having lockup on evergreen so do not enable
++			 * hyperz when not writing zbuffer
++			 */
++			rctx->db_misc_state.atom.dirty = true;
++		}
++	}
+ 
+ 	r600_set_stencil_ref(ctx, &ref);
+ 
+@@ -972,6 +982,7 @@ r600_create_so_target(struct pipe_context *ctx,
+ {
+ 	struct r600_context *rctx = (struct r600_context *)ctx;
+ 	struct r600_so_target *t;
++	struct r600_resource *rbuffer = (struct r600_resource*)buffer;
+ 
+ 	t = CALLOC_STRUCT(r600_so_target);
+ 	if (!t) {
+@@ -991,6 +1002,9 @@ r600_create_so_target(struct pipe_context *ctx,
+ 	pipe_resource_reference(&t->b.buffer, buffer);
+ 	t->b.buffer_offset = buffer_offset;
+ 	t->b.buffer_size = buffer_size;
++
++	util_range_add(&rbuffer->valid_buffer_range, buffer_offset,
++		       buffer_offset + buffer_size);
+ 	return &t->b;
+ }
+ 
+diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
+index 621e7a1..81e5a6c 100644
+--- a/src/gallium/drivers/r600/r600d.h
++++ b/src/gallium/drivers/r600/r600d.h
+@@ -119,6 +119,7 @@
+ #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
+ #define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH	0x1f
+ #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
++#define EVENT_TYPE_FLUSH_AND_INV_DB_META       0x2c /* supported on r700+ */
+ #define EVENT_TYPE_FLUSH_AND_INV_CB_META	46 /* supported on r700+ */
+ #define		EVENT_TYPE(x)                           ((x) << 0)
+ #define		EVENT_INDEX(x)                          ((x) << 8)
+diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+index 0f90991..8902ae4 100644
+--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
++++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+@@ -766,6 +766,22 @@ static void emit_icmp(
+ 	emit_data->output[emit_data->chan] = v;
+ }
+ 
++static void emit_ucmp(
++		const struct lp_build_tgsi_action * action,
++		struct lp_build_tgsi_context * bld_base,
++		struct lp_build_emit_data * emit_data)
++{
++	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
++
++	/* Select args[2] when args[0] >= 0.0 (or is unordered), else args[1].
++	 * NOTE(review): TGSI describes UCMP as an integer "src0 != 0" select;
++	 * confirm this float compare and operand order are intended. */
++	LLVMValueRef v = LLVMBuildFCmp(builder, LLVMRealUGE,
++			emit_data->args[0], lp_build_const_float(bld_base->base.gallivm, 0.), "");
++
++	emit_data->output[emit_data->chan] = LLVMBuildSelect(builder, v, emit_data->args[2], emit_data->args[1], "");
++}
++
+ static void emit_cmp(
+ 		const struct lp_build_tgsi_action *action,
+ 		struct lp_build_tgsi_context * bld_base,
+@@ -1241,6 +1257,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
+ 	bld_base->op_actions[TGSI_OPCODE_USNE].emit = emit_icmp;
+ 	bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f;
+ 	bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor;
++	bld_base->op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp;
+ 
+ 	bld_base->rsq_action.emit = build_tgsi_intrinsic_nomem;
+ 	bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq";
+diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c
+index 2545634..7922928 100644
+--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
++++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
+@@ -309,14 +309,8 @@ static void declare_input_fs(
+ 	/* XXX: Handle all possible interpolation modes */
+ 	switch (decl->Interp.Interpolate) {
+ 	case TGSI_INTERPOLATE_COLOR:
+-		/* XXX: Flat shading hangs the GPU */
+-		if (si_shader_ctx->rctx->queued.named.rasterizer &&
+-		    si_shader_ctx->rctx->queued.named.rasterizer->flatshade) {
+-#if 0
++		if (si_shader_ctx->key.flatshade) {
+ 			intr_name = "llvm.SI.fs.interp.constant";
+-#else
+-			intr_name = "llvm.SI.fs.interp.linear.center";
+-#endif
+ 		} else {
+ 			if (decl->Interp.Centroid)
+ 				intr_name = "llvm.SI.fs.interp.persp.centroid";
+@@ -325,11 +319,8 @@ static void declare_input_fs(
+ 		}
+ 		break;
+ 	case TGSI_INTERPOLATE_CONSTANT:
+-		/* XXX: Flat shading hangs the GPU */
+-#if 0
+ 		intr_name = "llvm.SI.fs.interp.constant";
+ 		break;
+-#endif
+ 	case TGSI_INTERPOLATE_LINEAR:
+ 		if (decl->Interp.Centroid)
+ 			intr_name = "llvm.SI.fs.interp.linear.centroid";
+diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h b/src/gallium/drivers/radeonsi/radeonsi_shader.h
+index 07b2f9f..f54f67c 100644
+--- a/src/gallium/drivers/radeonsi/radeonsi_shader.h
++++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h
+@@ -82,6 +82,7 @@ struct si_shader_key {
+ 	unsigned		nr_cbufs:4;
+ 	unsigned		color_two_side:1;
+ 	unsigned		alpha_func:3;
++	unsigned		flatshade:1;
+ 	float			alpha_ref;
+ };
+ 
+diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
+index a6b1983..39817fb 100644
+--- a/src/gallium/drivers/radeonsi/si_state.c
++++ b/src/gallium/drivers/radeonsi/si_state.c
+@@ -421,8 +421,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
+ 	rs->offset_units = state->offset_units;
+ 	rs->offset_scale = state->offset_scale * 12.0f;
+ 
+-	/* XXX: Flat shading hangs the GPU */
+-	tmp = S_0286D4_FLAT_SHADE_ENA(0);
++	tmp = S_0286D4_FLAT_SHADE_ENA(1);
+ 	if (state->sprite_coord_enable) {
+ 		tmp |= S_0286D4_PNT_SPRITE_ENA(1) |
+ 			S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
+@@ -1859,7 +1858,7 @@ static INLINE struct si_shader_key si_shader_selector_key(struct pipe_context *c
+ 		key.export_16bpc = rctx->export_16bpc;
+ 		if (rctx->queued.named.rasterizer) {
+ 			key.color_two_side = rctx->queued.named.rasterizer->two_side;
+-			/*key.flatshade = rctx->queued.named.rasterizer->flatshade;*/
++			key.flatshade = rctx->queued.named.rasterizer->flatshade;
+ 		}
+ 		if (rctx->queued.named.dsa) {
+ 			key.alpha_func = rctx->queued.named.dsa->alpha_func;
+diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
+index 3704410..8c35625 100644
+--- a/src/gallium/drivers/radeonsi/si_state_draw.c
++++ b/src/gallium/drivers/radeonsi/si_state_draw.c
+@@ -128,11 +128,6 @@ static void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *s
+ 			continue;
+ 		}
+ 
+-		/* XXX: Flat shading hangs the GPU */
+-		if (shader->shader.input[i].interpolate == TGSI_INTERPOLATE_CONSTANT ||
+-		    (shader->shader.input[i].interpolate == TGSI_INTERPOLATE_COLOR &&
+-		     rctx->queued.named.rasterizer->flatshade))
+-			have_linear = TRUE;
+ 		if (shader->shader.input[i].interpolate == TGSI_INTERPOLATE_LINEAR)
+ 			have_linear = TRUE;
+ 		if (shader->shader.input[i].interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
+@@ -327,15 +322,12 @@ static void si_update_spi_map(struct r600_context *rctx)
+ bcolor:
+ 		tmp = 0;
+ 
+-#if 0
+-		/* XXX: Flat shading hangs the GPU */
+ 		if (name == TGSI_SEMANTIC_POSITION ||
+ 		    ps->input[i].interpolate == TGSI_INTERPOLATE_CONSTANT ||
+ 		    (ps->input[i].interpolate == TGSI_INTERPOLATE_COLOR &&
+-		     rctx->rasterizer && rctx->rasterizer->flatshade)) {
++		     rctx->ps_shader->current->key.flatshade)) {
+ 			tmp |= S_028644_FLAT_SHADE(1);
+ 		}
+-#endif
+ 
+ 		if (name == TGSI_SEMANTIC_GENERIC &&
+ 		    rctx->sprite_coord_enable & (1 << ps->input[i].sid)) {
+@@ -453,8 +445,14 @@ static void si_vertex_buffer_update(struct r600_context *rctx)
+ 		si_pm4_sh_data_add(pm4, va & 0xFFFFFFFF);
+ 		si_pm4_sh_data_add(pm4, (S_008F04_BASE_ADDRESS_HI(va >> 32) |
+ 					 S_008F04_STRIDE(vb->stride)));
+-		si_pm4_sh_data_add(pm4, (vb->buffer->width0 - vb->buffer_offset) /
+-					 MAX2(vb->stride, 1));
++		if (vb->stride)
++			/* Round up by rounding down and adding 1 */
++			si_pm4_sh_data_add(pm4,
++					   (vb->buffer->width0 - offset -
++					    util_format_get_blocksize(ve->src_format)) /
++					   vb->stride + 1);
++		else
++			si_pm4_sh_data_add(pm4, vb->buffer->width0 - offset);
+ 		si_pm4_sh_data_add(pm4, rctx->vertex_elements->rsrc_word3[i]);
+ 
+ 		if (!bound[ve->vertex_buffer_index]) {
+diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.c b/src/gallium/state_trackers/glx/xlib/xm_api.c
+index 607584f..021175c 100644
+--- a/src/gallium/state_trackers/glx/xlib/xm_api.c
++++ b/src/gallium/state_trackers/glx/xlib/xm_api.c
+@@ -438,7 +438,6 @@ create_xmesa_buffer(Drawable d, BufferType type,
+ {
+    XMesaDisplay xmdpy = xmesa_init_display(vis->display);
+    XMesaBuffer b;
+-   uint width, height;
+ 
+    ASSERT(type == WINDOW || type == PIXMAP || type == PBUFFER);
+ 
+@@ -457,7 +456,7 @@ create_xmesa_buffer(Drawable d, BufferType type,
+    b->type = type;
+    b->cmap = cmap;
+ 
+-   get_drawable_size(vis->display, d, &width, &height);
++   get_drawable_size(vis->display, d, &b->width, &b->height);
+ 
+    /*
+     * Create framebuffer, but we'll plug in our own renderbuffers below.
+diff --git a/src/gallium/targets/dri-vmwgfx/Makefile.am b/src/gallium/targets/dri-vmwgfx/Makefile.am
+index 06ebf88..ca7df65 100644
+--- a/src/gallium/targets/dri-vmwgfx/Makefile.am
++++ b/src/gallium/targets/dri-vmwgfx/Makefile.am
+@@ -58,17 +58,13 @@ vmwgfx_dri_la_LIBADD = \
+ 	$(top_builddir)/src/gallium/drivers/svga/libsvga.la \
+ 	$(GALLIUM_DRI_LIB_DEPS)
+ 
+-if HAVE_MESA_LLVM
+ vmwgfx_dri_la_LINK = $(CXXLINK) $(vmwgfx_dri_la_LDFLAGS)
+ # Mention a dummy pure C++ file to trigger generation of the $(LINK) variable
+ nodist_EXTRA_vmwgfx_dri_la_SOURCES = dummy-cpp.cpp
+ 
++if HAVE_MESA_LLVM
+ vmwgfx_dri_la_LDFLAGS += $(LLVM_LDFLAGS)
+ vmwgfx_dri_la_LIBADD += $(LLVM_LIBS)
+-else
+-vmwgfx_dri_la_LINK = $(LINK) $(vmwgfx_dri_la_LDFLAGS)
+-# Mention a dummy pure C file to trigger generation of the $(LINK) variable
+-nodist_EXTRA_vmwgfx_dri_la_SOURCES = dummy-c.c
+ endif
+ 
+ # Provide compatibility with scripts for the old Mesa build system for
+diff --git a/src/gallium/targets/vdpau-softpipe/Makefile.am b/src/gallium/targets/vdpau-softpipe/Makefile.am
+index 3372b5c..7bde2f8 100644
+--- a/src/gallium/targets/vdpau-softpipe/Makefile.am
++++ b/src/gallium/targets/vdpau-softpipe/Makefile.am
+@@ -35,7 +35,7 @@ vdpaudir = $(VDPAU_LIB_INSTALL_DIR)
+ vdpau_LTLIBRARIES = libvdpau_softpipe.la
+ 
+ libvdpau_softpipe_la_SOURCES = \
+-	$(top_srcdir)/src/gallium/auxiliary/vl/vl_winsys_dri.c
++	$(top_srcdir)/src/gallium/auxiliary/vl/vl_winsys_xsp.c
+ 
+ libvdpau_softpipe_la_LDFLAGS = \
+ 	-module \
+diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+index 2d41c26..f4ac526 100644
+--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
++++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+@@ -957,16 +957,16 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
+ 
+             bo->flinked = TRUE;
+             bo->flink = flink.name;
++
++            pipe_mutex_lock(bo->mgr->bo_handles_mutex);
++            util_hash_table_set(bo->mgr->bo_handles, (void*)(uintptr_t)bo->flink, bo);
++            pipe_mutex_unlock(bo->mgr->bo_handles_mutex);
+         }
+         whandle->handle = bo->flink;
+     } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+         whandle->handle = bo->handle;
+     }
+ 
+-    pipe_mutex_lock(bo->mgr->bo_handles_mutex);
+-    util_hash_table_set(bo->mgr->bo_handles, (void*)(uintptr_t)whandle->handle, bo);
+-    pipe_mutex_unlock(bo->mgr->bo_handles_mutex);
+-
+     whandle->stride = stride;
+     return TRUE;
+ }
+diff --git a/src/gbm/backends/dri/gbm_dri.c b/src/gbm/backends/dri/gbm_dri.c
+index 519929e..a3a0530 100644
+--- a/src/gbm/backends/dri/gbm_dri.c
++++ b/src/gbm/backends/dri/gbm_dri.c
+@@ -481,6 +481,7 @@ create_dumb(struct gbm_device *gbm,
+    bo->base.base.width = width;
+    bo->base.base.height = height;
+    bo->base.base.stride = create_arg.pitch;
++   bo->base.base.format = format;
+    bo->base.base.handle.u32 = create_arg.handle;
+    bo->handle = create_arg.handle;
+    bo->size = create_arg.size;
+@@ -529,6 +530,7 @@ gbm_dri_bo_create(struct gbm_device *gbm,
+    bo->base.base.gbm = gbm;
+    bo->base.base.width = width;
+    bo->base.base.height = height;
++   bo->base.base.format = format;
+ 
+    switch (format) {
+    case GBM_FORMAT_RGB565:
+diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
+index 4e32b50..29a209e 100644
+--- a/src/mesa/drivers/common/meta.c
++++ b/src/mesa/drivers/common/meta.c
+@@ -1910,6 +1910,14 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
+       GLuint *tmp = malloc(srcW * srcH * sizeof(GLuint));
+ 
+       if (tmp) {
++
++         newTex = alloc_texture(depthTex, srcW, srcH, GL_DEPTH_COMPONENT);
++         _mesa_ReadPixels(srcX, srcY, srcW, srcH, GL_DEPTH_COMPONENT,
++                          GL_UNSIGNED_INT, tmp);
++         setup_drawpix_texture(ctx, depthTex, newTex, GL_DEPTH_COMPONENT,
++                               srcW, srcH, GL_DEPTH_COMPONENT,
++                               GL_UNSIGNED_INT, tmp);
++
+          /* texcoords (after texture allocation!) */
+          {
+             verts[0].s = 0.0F;
+@@ -1928,15 +1936,6 @@ _mesa_meta_BlitFramebuffer(struct gl_context *ctx,
+          if (!blit->DepthFP)
+             init_blit_depth_pixels(ctx);
+ 
+-         /* maybe change tex format here */
+-         newTex = alloc_texture(depthTex, srcW, srcH, GL_DEPTH_COMPONENT);
+-
+-         _mesa_ReadPixels(srcX, srcY, srcW, srcH,
+-                          GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, tmp);
+-
+-         setup_drawpix_texture(ctx, depthTex, newTex, GL_DEPTH_COMPONENT, srcW, srcH,
+-                               GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, tmp);
+-
+          _mesa_BindProgramARB(GL_FRAGMENT_PROGRAM_ARB, blit->DepthFP);
+          _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB, GL_TRUE);
+          _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
+index dc140df..77670ef 100644
+--- a/src/mesa/drivers/dri/i965/Makefile.am
++++ b/src/mesa/drivers/dri/i965/Makefile.am
+@@ -62,6 +62,7 @@ TEST_LIBS = \
+ 	../common/libdri_test_stubs.la
+ 
+ i965_dri_la_SOURCES =
++nodist_EXTRA_i965_dri_la_SOURCES = dummy2.cpp
+ i965_dri_la_LIBADD = $(COMMON_LIBS)
+ i965_dri_la_LDFLAGS = -module -avoid-version -shared
+ 
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
+index 8dab431..f80219e 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
+@@ -258,6 +258,26 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
+    return instructions;
+ }
+ 
++/**
++ * A helper for MOV generation for fixing up broken hardware SEND dependency
++ * handling.
++ */
++fs_inst *
++fs_visitor::DEP_RESOLVE_MOV(int grf)
++{
++   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
++
++   inst->ir = NULL;
++   inst->annotation = "send dependency resolve";
++
++   /* The caller always wants uncompressed to emit the minimal extra
++    * dependencies, and to avoid having to deal with aligning its regs to 2.
++    */
++   inst->force_uncompressed = true;
++
++   return inst;
++}
++
+ bool
+ fs_inst::equals(fs_inst *inst)
+ {
+@@ -1690,8 +1710,6 @@ fs_visitor::setup_pull_constants()
+                                  dst, index, offset);
+ 	 pull->ir = inst->ir;
+ 	 pull->annotation = inst->annotation;
+-	 pull->base_mrf = 14;
+-	 pull->mlen = 1;
+ 
+ 	 inst->insert_before(pull);
+ 
+@@ -1911,6 +1929,7 @@ fs_visitor::register_coalesce()
+ 
+       bool has_source_modifiers = (inst->src[0].abs ||
+                                    inst->src[0].negate ||
++                                   inst->src[0].smear != -1 ||
+                                    inst->src[0].file == UNIFORM);
+ 
+       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
+@@ -2228,6 +2247,265 @@ fs_visitor::remove_duplicate_mrf_writes()
+    return progress;
+ }
+ 
++static void
++clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
++                        int first_grf, int grf_len)
++{
++   bool inst_16wide = (dispatch_width > 8 &&
++                       !inst->force_uncompressed &&
++                       !inst->force_sechalf);
++
++   /* Clear the flag for registers that actually got read (as expected). */
++   for (int i = 0; i < 3; i++) {
++      int grf;
++      if (inst->src[i].file == GRF) {
++         grf = inst->src[i].reg;
++      } else if (inst->src[i].file == FIXED_HW_REG &&
++                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
++         grf = inst->src[i].fixed_hw_reg.nr;
++      } else {
++         continue;
++      }
++
++      if (grf >= first_grf &&
++          grf < first_grf + grf_len) {
++         deps[grf - first_grf] = false;
++         if (inst_16wide)
++            deps[grf - first_grf + 1] = false;
++      }
++   }
++}
++
++/**
++ * Implements this workaround for the original 965:
++ *
++ *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
++ *      check for post destination dependencies on this instruction, software
++ *      must ensure that there is no destination hazard for the case of ‘write
++ *      followed by a posted write’ shown in the following example.
++ *
++ *      1. mov r3 0
++ *      2. send r3.xy <rest of send instruction>
++ *      3. mov r2 r3
++ *
++ *      Due to no post-destination dependency check on the ‘send’, the above
++ *      code sequence could have two instructions (1 and 2) in flight at the
++ *      same time that both consider ‘r3’ as the target of their final writes."
++ */
++void
++fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
++{
++   int write_len = inst->regs_written() * dispatch_width / 8;
++   int first_write_grf = inst->dst.reg;
++   bool needs_dep[BRW_MAX_MRF];
++   assert(write_len < (int)sizeof(needs_dep) - 1);
++
++   memset(needs_dep, false, sizeof(needs_dep));
++   memset(needs_dep, true, write_len);
++
++   clear_deps_for_inst_src(inst, dispatch_width,
++                           needs_dep, first_write_grf, write_len);
++
++   /* Walk backwards looking for writes to registers we're writing which
++    * aren't read since being written.  Stop at the list's head sentinel:
++    * if we hit the start of the program, we assume that there are no
++    * outstanding dependencies on entry to the program.
++    */
++   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
++        !scan_inst->is_head_sentinel();
++        scan_inst = (fs_inst *)scan_inst->prev) {
++
++      /* If we hit control flow, assume that there *are* outstanding
++       * dependencies, force their cleanup before our instruction, and stop.
++       */
++      if (scan_inst->is_control_flow()) {
++         for (int i = 0; i < write_len; i++) {
++            if (needs_dep[i])
++               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
++         }
++         return;
++      }
++
++      bool scan_inst_16wide = (dispatch_width > 8 &&
++                               !scan_inst->force_uncompressed &&
++                               !scan_inst->force_sechalf);
++
++      /* We insert our reads as late as possible on the assumption that any
++       * instruction but a MOV that might have left us an outstanding
++       * dependency has more latency than a MOV.
++       */
++      if (scan_inst->dst.file == GRF &&
++          scan_inst->dst.reg >= first_write_grf &&
++          scan_inst->dst.reg < first_write_grf + write_len &&
++          needs_dep[scan_inst->dst.reg - first_write_grf]) {
++         inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
++         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
++         if (scan_inst_16wide)
++            needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
++      }
++
++      /* Clear the flag for registers that actually got read (as expected). */
++      clear_deps_for_inst_src(scan_inst, dispatch_width,
++                              needs_dep, first_write_grf, write_len);
++
++      /* Continue the loop only if we haven't resolved all the dependencies */
++      int i;
++      for (i = 0; i < write_len; i++) {
++         if (needs_dep[i])
++            break;
++      }
++      if (i == write_len)
++         return;
++   }
++}
++
++/**
++ * Implements this workaround for the original 965:
++ *
++ *     "[DevBW, DevCL] Errata: A destination register from a send can not be
++ *      used as a destination register until after it has been sourced by an
++ *      instruction with a different destination register."
++ */
++void
++fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
++{
++   int write_len = inst->regs_written() * dispatch_width / 8;
++   int first_write_grf = inst->dst.reg;
++   bool needs_dep[BRW_MAX_MRF];
++   assert(write_len < (int)sizeof(needs_dep) - 1);
++
++   memset(needs_dep, false, sizeof(needs_dep));
++   memset(needs_dep, true, write_len);
++   /* Walk forwards looking for writes to registers we're writing which aren't
++    * read before being written.
++    */
++   for (fs_inst *scan_inst = (fs_inst *)inst->next;
++        !scan_inst->is_tail_sentinel();
++        scan_inst = (fs_inst *)scan_inst->next) {
++      /* At control flow, force resolve all remaining dependencies and stop. */
++      if (scan_inst->is_control_flow()) {
++         for (int i = 0; i < write_len; i++)
++            if (needs_dep[i])
++               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
++         return;
++      }
++
++      /* Clear the flag for registers that actually got read (as expected). */
++      clear_deps_for_inst_src(scan_inst, dispatch_width,
++                              needs_dep, first_write_grf, write_len);
++
++      /* We insert our reads as late as possible since they're reading the
++       * result of a SEND, which has massive latency.
++       */
++      if (scan_inst->dst.file == GRF &&
++          scan_inst->dst.reg >= first_write_grf &&
++          scan_inst->dst.reg < first_write_grf + write_len &&
++          needs_dep[scan_inst->dst.reg - first_write_grf]) {
++         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
++         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
++      }
++
++      /* Continue the loop only if we haven't resolved all the dependencies */
++      int i;
++      for (i = 0; i < write_len; i++) {
++         if (needs_dep[i])
++            break;
++      }
++      if (i == write_len)
++         return;
++   }
++
++   /* If we hit the end of the program, resolve all remaining dependencies out
++    * of paranoia.
++    */
++   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
++   assert(last_inst->eot);
++   for (int i = 0; i < write_len; i++) {
++      if (needs_dep[i])
++         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
++   }
++}
++
++void
++fs_visitor::insert_gen4_send_dependency_workarounds()
++{
++   if (intel->gen != 4 || intel->is_g4x)
++      return;
++
++   /* Note that we're done with register allocation, so GRF fs_regs always
++    * have a .reg_offset of 0.
++    */
++
++   foreach_list_safe(node, &this->instructions) {
++      fs_inst *inst = (fs_inst *)node;
++
++      if (inst->mlen != 0 && inst->dst.file == GRF) {
++         insert_gen4_pre_send_dependency_workarounds(inst);
++         insert_gen4_post_send_dependency_workarounds(inst);
++      }
++   }
++}
++
++/**
++ * Turns the generic expression-style uniform pull constant load instruction
++ * into a hardware-specific series of instructions for loading a pull
++ * constant.
++ *
++ * The expression style allows the CSE pass before this to optimize out
++ * repeated loads from the same offset, and gives the pre-register-allocation
++ * scheduling full flexibility, while the conversion to native instructions
++ * allows the post-register-allocation scheduler the best information
++ * possible.
++ */
++void
++fs_visitor::lower_uniform_pull_constant_loads()
++{
++   foreach_list(node, &this->instructions) {
++      fs_inst *inst = (fs_inst *)node;
++
++      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
++         continue;
++
++      if (intel->gen >= 7) {
++         fs_reg const_offset_reg = inst->src[1];
++         assert(const_offset_reg.file == IMM &&
++                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
++         const_offset_reg.imm.u /= 16;
++         fs_reg payload = fs_reg(this, glsl_type::uint_type);
++         struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
++                                    BRW_REGISTER_TYPE_UD);
++
++         fs_inst *setup1 = MOV(payload, fs_reg(g0));
++         setup1->force_writemask_all = true;
++         /* We don't need the second half of this vgrf to be filled with g1
++          * in the 16-wide case, but if we use force_uncompressed then live
++          * variable analysis won't consider this a def!
++          */
++
++         fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
++                                                payload, payload,
++                                                const_offset_reg);
++
++         setup1->ir = inst->ir;
++         setup1->annotation = inst->annotation;
++         inst->insert_before(setup1);
++         setup2->ir = inst->ir;
++         setup2->annotation = inst->annotation;
++         inst->insert_before(setup2);
++         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
++         inst->src[1] = payload;
++      } else {
++         /* Before register allocation, we didn't tell the scheduler about the
++          * MRF we use.  We know it's safe to use this MRF because nothing
++          * else does except for register spill/unspill, which generates and
++          * uses its MRF within a single IR instruction.
++          */
++         inst->base_mrf = 14;
++         inst->mlen = 1;
++      }
++   }
++}
++
+ void
+ fs_visitor::dump_instruction(fs_inst *inst)
+ {
+@@ -2500,6 +2778,8 @@ fs_visitor::run()
+ 
+       schedule_instructions(false);
+ 
++      lower_uniform_pull_constant_loads();
++
+       assign_curb_setup();
+       assign_urb_setup();
+ 
+@@ -2522,6 +2802,12 @@ fs_visitor::run()
+    assert(force_uncompressed_stack == 0);
+    assert(force_sechalf_stack == 0);
+ 
++   /* This must come after all optimization and register allocation, since
++    * it inserts dead code that happens to have side effects, and it does
++    * so based on the actual physical registers in use.
++    */
++   insert_gen4_send_dependency_workarounds();
++
+    if (failed)
+       return false;
+ 
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
+index 88fecb9..d1bb111 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.h
++++ b/src/mesa/drivers/dri/i965/brw_fs.h
+@@ -285,6 +285,7 @@ public:
+    fs_inst *IF(fs_reg src0, fs_reg src1, uint32_t condition);
+    fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1,
+                 uint32_t condition);
++   fs_inst *DEP_RESOLVE_MOV(int grf);
+ 
+    int type_size(const struct glsl_type *type);
+    fs_inst *get_instruction_generating_reg(fs_inst *start,
+@@ -329,7 +330,11 @@ public:
+    bool remove_duplicate_mrf_writes();
+    bool virtual_grf_interferes(int a, int b);
+    void schedule_instructions(bool post_reg_alloc);
++   void insert_gen4_send_dependency_workarounds();
++   void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
++   void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
+    void fail(const char *msg, ...);
++   void lower_uniform_pull_constant_loads();
+ 
+    void push_force_uncompressed();
+    void pop_force_uncompressed();
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+index c4ec1d9..194ed07 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+@@ -223,7 +223,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
+    inst->src[arg].file = entry->src.file;
+    inst->src[arg].reg = entry->src.reg;
+    inst->src[arg].reg_offset = entry->src.reg_offset;
+-   inst->src[arg].smear = entry->src.smear;
++   if (entry->src.smear != -1)
++      inst->src[arg].smear = entry->src.smear;
+ 
+    if (!inst->src[arg].abs) {
+       inst->src[arg].abs = entry->src.abs;
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+index 70c143a..a13ca36 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+@@ -105,7 +105,8 @@ fs_visitor::opt_cse_local(bblock_t *block, exec_list *aeb)
+ 	    /* Match current instruction's expression against those in AEB. */
+ 	    if (inst->opcode == entry->generator->opcode &&
+ 		inst->saturate == entry->generator->saturate &&
+-		operands_match(entry->generator->src, inst->src)) {
++                inst->dst.type == entry->generator->dst.type &&
++                operands_match(entry->generator->src, inst->src)) {
+ 
+ 	       found = true;
+ 	       progress = true;
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+index 45072da..365a2ec 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+@@ -604,29 +604,8 @@ fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst)
+ {
+    assert(inst->mlen != 0);
+ 
+-   /* Clear any post destination dependencies that would be ignored by
+-    * the block read.  See the B-Spec for pre-gen5 send instruction.
+-    *
+-    * This could use a better solution, since texture sampling and
+-    * math reads could potentially run into it as well -- anywhere
+-    * that we have a SEND with a destination that is a register that
+-    * was written but not read within the last N instructions (what's
+-    * N?  unsure).  This is rare because of dead code elimination, but
+-    * not impossible.
+-    */
+-   if (intel->gen == 4 && !intel->is_g4x)
+-      brw_MOV(p, brw_null_reg(), dst);
+-
+    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
+ 				inst->offset);
+-
+-   if (intel->gen == 4 && !intel->is_g4x) {
+-      /* gen4 errata: destination from a send can't be used as a
+-       * destination until it's been read.  Just read it so we don't
+-       * have to worry.
+-       */
+-      brw_MOV(p, brw_null_reg(), dst);
+-   }
+ }
+ 
+ void
+@@ -637,19 +616,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
+ {
+    assert(inst->mlen != 0);
+ 
+-   /* Clear any post destination dependencies that would be ignored by
+-    * the block read.  See the B-Spec for pre-gen5 send instruction.
+-    *
+-    * This could use a better solution, since texture sampling and
+-    * math reads could potentially run into it as well -- anywhere
+-    * that we have a SEND with a destination that is a register that
+-    * was written but not read within the last N instructions (what's
+-    * N?  unsure).  This is rare because of dead code elimination, but
+-    * not impossible.
+-    */
+-   if (intel->gen == 4 && !intel->is_g4x)
+-      brw_MOV(p, brw_null_reg(), dst);
+-
+    assert(index.file == BRW_IMMEDIATE_VALUE &&
+ 	  index.type == BRW_REGISTER_TYPE_UD);
+    uint32_t surf_index = index.dw1.ud;
+@@ -660,14 +626,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
+ 
+    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
+ 			read_offset, surf_index);
+-
+-   if (intel->gen == 4 && !intel->is_g4x) {
+-      /* gen4 errata: destination from a send can't be used as a
+-       * destination until it's been read.  Just read it so we don't
+-       * have to worry.
+-       */
+-      brw_MOV(p, brw_null_reg(), dst);
+-   }
+ }
+ 
+ void
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+index d4f6fc9..573921c 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+@@ -597,31 +597,9 @@ fs_visitor::visit(ir_expression *ir)
+          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
+          packed_consts.type = result.type;
+ 
+-         if (intel->gen >= 7) {
+-            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
+-            fs_reg payload = fs_reg(this, glsl_type::uint_type);
+-            struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
+-                                       BRW_REGISTER_TYPE_UD);
+-            fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
+-            setup->force_writemask_all = true;
+-            /* We don't need the second half of this vgrf to be filled with g1
+-             * in the 16-wide case, but if we use force_uncompressed then live
+-             * variable analysis won't consider this a def!
+-             */
+-
+-            emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
+-                 payload, const_offset_reg);
+-            emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
+-                 surf_index, payload);
+-         } else {
+-            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
+-            fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+-                                         packed_consts,
+-                                         surf_index,
+-                                         const_offset_reg));
+-            pull->base_mrf = 14;
+-            pull->mlen = 1;
+-         }
++         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
++         emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
++                      packed_consts, surf_index, const_offset_reg));
+ 
+          packed_consts.smear = const_offset->value.u[0] % 16 / 4;
+          for (int i = 0; i < ir->type->vector_elements; i++) {
+diff --git a/src/mesa/drivers/dri/i965/brw_vs_constval.c b/src/mesa/drivers/dri/i965/brw_vs_constval.c
+index 3d53843..48635c5 100644
+--- a/src/mesa/drivers/dri/i965/brw_vs_constval.c
++++ b/src/mesa/drivers/dri/i965/brw_vs_constval.c
+@@ -238,6 +238,23 @@ static void calc_wm_input_sizes( struct brw_context *brw )
+ 
+    calc_sizes(&t);
+ 
++   /* _NEW_POINT
++    *
++    * If the SF will be replacing the vertex output with a reference to
++    * gl_PointCoord, then tell the fragment shader that the value actually
++    * does vary.
++    */
++   if (ctx->Point.PointSprite) {
++      for (int i = 0; i < 8; i++) {
++         if (ctx->Point.CoordReplace[i]) {
++            t.size_masks[4-1] |= FRAG_BIT_TEX(i);
++            t.size_masks[3-1] |= FRAG_BIT_TEX(i);
++            t.size_masks[2-1] |= FRAG_BIT_TEX(i);
++            t.size_masks[1-1] |= FRAG_BIT_TEX(i);
++         }
++      }
++   }
++
+    if (memcmp(brw->wm.input_size_masks, t.size_masks, sizeof(t.size_masks)) != 0) {
+       memcpy(brw->wm.input_size_masks, t.size_masks, sizeof(t.size_masks));
+       brw->state.dirty.brw |= BRW_NEW_WM_INPUT_DIMENSIONS;
+@@ -246,7 +263,7 @@ static void calc_wm_input_sizes( struct brw_context *brw )
+ 
+ const struct brw_tracked_state brw_wm_input_sizes = {
+    .dirty = {
+-      .mesa  = _NEW_LIGHT | _NEW_PROGRAM,
++      .mesa  = _NEW_LIGHT | _NEW_PROGRAM | _NEW_POINT,
+       .brw   = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_INPUT_DIMENSIONS,
+       .cache = 0
+    },
+diff --git a/src/mesa/drivers/dri/intel/intel_chipset.h b/src/mesa/drivers/dri/intel/intel_chipset.h
+index 9c00ba8..885f6c2 100644
+--- a/src/mesa/drivers/dri/intel/intel_chipset.h
++++ b/src/mesa/drivers/dri/intel/intel_chipset.h
+@@ -114,15 +114,15 @@
+ #define PCI_CHIP_HASWELL_ULT_S_GT1      0x0A0A /* Server */
+ #define PCI_CHIP_HASWELL_ULT_S_GT2      0x0A1A
+ #define PCI_CHIP_HASWELL_ULT_S_GT2_PLUS 0x0A2A
+-#define PCI_CHIP_HASWELL_CRW_GT1        0x0D12 /* Desktop */
+-#define PCI_CHIP_HASWELL_CRW_GT2        0x0D22
+-#define PCI_CHIP_HASWELL_CRW_GT2_PLUS   0x0D32
+-#define PCI_CHIP_HASWELL_CRW_M_GT1      0x0D16 /* Mobile */
+-#define PCI_CHIP_HASWELL_CRW_M_GT2      0x0D26
+-#define PCI_CHIP_HASWELL_CRW_M_GT2_PLUS 0x0D36
+-#define PCI_CHIP_HASWELL_CRW_S_GT1      0x0D1A /* Server */
+-#define PCI_CHIP_HASWELL_CRW_S_GT2      0x0D2A
+-#define PCI_CHIP_HASWELL_CRW_S_GT2_PLUS 0x0D3A
++#define PCI_CHIP_HASWELL_CRW_GT1        0x0D02 /* Desktop */
++#define PCI_CHIP_HASWELL_CRW_GT2        0x0D12
++#define PCI_CHIP_HASWELL_CRW_GT2_PLUS   0x0D22
++#define PCI_CHIP_HASWELL_CRW_M_GT1      0x0D06 /* Mobile */
++#define PCI_CHIP_HASWELL_CRW_M_GT2      0x0D16
++#define PCI_CHIP_HASWELL_CRW_M_GT2_PLUS 0x0D26
++#define PCI_CHIP_HASWELL_CRW_S_GT1      0x0D0A /* Server */
++#define PCI_CHIP_HASWELL_CRW_S_GT2      0x0D1A
++#define PCI_CHIP_HASWELL_CRW_S_GT2_PLUS 0x0D2A
+ 
+ #define IS_MOBILE(devid)	(devid == PCI_CHIP_I855_GM || \
+ 				 devid == PCI_CHIP_I915_GM || \
+diff --git a/src/mesa/main/attrib.c b/src/mesa/main/attrib.c
+index a951283..6d91534 100644
+--- a/src/mesa/main/attrib.c
++++ b/src/mesa/main/attrib.c
+@@ -130,6 +130,9 @@ struct gl_enable_attrib
+    GLboolean VertexProgramPointSize;
+    GLboolean VertexProgramTwoSide;
+ 
++   /* GL_ARB_fragment_program */
++   GLboolean FragmentProgram;
++
+    /* GL_ARB_point_sprite / GL_NV_point_sprite */
+    GLboolean PointSprite;
+    GLboolean FragmentShaderATI;
+@@ -316,6 +319,10 @@ _mesa_PushAttrib(GLbitfield mask)
+       attr->VertexProgram = ctx->VertexProgram.Enabled;
+       attr->VertexProgramPointSize = ctx->VertexProgram.PointSizeEnabled;
+       attr->VertexProgramTwoSide = ctx->VertexProgram.TwoSideEnabled;
++
++      /* GL_ARB_fragment_program */
++      attr->FragmentProgram = ctx->FragmentProgram.Enabled;
++
+       save_attrib_data(&head, GL_ENABLE_BIT, attr);
+ 
+       /* GL_ARB_framebuffer_sRGB / GL_EXT_framebuffer_sRGB */
+@@ -607,6 +614,11 @@ pop_enable_group(struct gl_context *ctx, const struct gl_enable_attrib *enable)
+                    enable->VertexProgramTwoSide,
+                    GL_VERTEX_PROGRAM_TWO_SIDE_ARB);
+ 
++   /* GL_ARB_fragment_program */
++   TEST_AND_UPDATE(ctx->FragmentProgram.Enabled,
++                   enable->FragmentProgram,
++                   GL_FRAGMENT_PROGRAM_ARB);
++
+    /* GL_ARB_framebuffer_sRGB / GL_EXT_framebuffer_sRGB */
+    TEST_AND_UPDATE(ctx->Color.sRGBEnabled, enable->sRGBEnabled,
+                    GL_FRAMEBUFFER_SRGB);
+diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
+index 5e9e539..df57b76 100644
+--- a/src/mesa/main/context.c
++++ b/src/mesa/main/context.c
+@@ -1072,7 +1072,6 @@ _mesa_initialize_context(struct gl_context *ctx,
+    case API_OPENGLES2:
+       ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
+       ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
+-      ctx->Point.PointSprite = GL_TRUE;  /* always on for ES 2.x */
+       break;
+    }
+ 
+diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
+index 8728540..c1e1658 100644
+--- a/src/mesa/main/glformats.c
++++ b/src/mesa/main/glformats.c
+@@ -917,7 +917,7 @@ _mesa_is_compressed_format(struct gl_context *ctx, GLenum format)
+    case GL_COMPRESSED_SIGNED_RG11_EAC:
+    case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+    case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+-      return _mesa_is_gles3(ctx);
++      return _mesa_is_gles3(ctx) || ctx->Extensions.ARB_ES3_compatibility;
+    case GL_PALETTE4_RGB8_OES:
+    case GL_PALETTE4_RGBA8_OES:
+    case GL_PALETTE4_R5_G6_B5_OES:
+diff --git a/src/mesa/main/points.c b/src/mesa/main/points.c
+index 1778640..c925d4c 100644
+--- a/src/mesa/main/points.c
++++ b/src/mesa/main/points.c
+@@ -253,7 +253,8 @@ _mesa_init_point(struct gl_context *ctx)
+     * In a core context, the state will default to true, and the setters and
+     * getters are disabled.
+     */
+-   ctx->Point.PointSprite = (ctx->API == API_OPENGL_CORE);
++   ctx->Point.PointSprite = (ctx->API == API_OPENGL_CORE ||
++                             ctx->API == API_OPENGLES2);
+ 
+    ctx->Point.SpriteRMode = GL_ZERO; /* GL_NV_point_sprite (only!) */
+    ctx->Point.SpriteOrigin = GL_UPPER_LEFT; /* GL_ARB_point_sprite */
+diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
+index d1723b8..1b9525b 100644
+--- a/src/mesa/main/teximage.c
++++ b/src/mesa/main/teximage.c
+@@ -520,7 +520,7 @@ _mesa_base_tex_format( struct gl_context *ctx, GLint internalFormat )
+       }
+    }
+ 
+-   if (_mesa_is_gles3(ctx)) {
++   if (_mesa_is_gles3(ctx) || ctx->Extensions.ARB_ES3_compatibility) {
+       switch (internalFormat) {
+       case GL_COMPRESSED_RGB8_ETC2:
+       case GL_COMPRESSED_SRGB8_ETC2:
+@@ -3187,6 +3187,12 @@ _mesa_EGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image)
+       return;
+    }
+ 
++   if (!image) {
++      _mesa_error(ctx, GL_INVALID_OPERATION,
++		  "glEGLImageTargetTexture2D(image=%p)", image);
++      return;
++   }
++
+    if (ctx->NewState & _NEW_PIXEL)
+       _mesa_update_state(ctx);
+ 
+diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
+index 52ede13..6f18ec6 100644
+--- a/src/mesa/main/texparam.c
++++ b/src/mesa/main/texparam.c
+@@ -1432,6 +1432,12 @@ _mesa_GetTexParameterfv( GLenum target, GLenum pname, GLfloat *params )
+          *params = (GLfloat) obj->Immutable;
+          break;
+ 
++      case GL_REQUIRED_TEXTURE_IMAGE_UNITS_OES:
++         if (!_mesa_is_gles(ctx) || !ctx->Extensions.OES_EGL_image_external)
++            goto invalid_pname;
++         *params = obj->RequiredTextureImageUnits;
++         break;
++
+       case GL_TEXTURE_SRGB_DECODE_EXT:
+          if (!ctx->Extensions.EXT_texture_sRGB_decode)
+             goto invalid_pname;
+diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
+index f20df9e..7fdfa72 100644
+--- a/src/mesa/state_tracker/st_atom_rasterizer.c
++++ b/src/mesa/state_tracker/st_atom_rasterizer.c
+@@ -135,16 +135,12 @@ static void update_raster_state( struct st_context *st )
+ 
+    /* _NEW_POLYGON 
+     */
+-   if (ctx->Polygon.OffsetUnits != 0.0 ||
+-       ctx->Polygon.OffsetFactor != 0.0) {
+-      raster->offset_point = ctx->Polygon.OffsetPoint;
+-      raster->offset_line = ctx->Polygon.OffsetLine;
+-      raster->offset_tri = ctx->Polygon.OffsetFill;
+-   }
+-
+    if (ctx->Polygon.OffsetPoint ||
+        ctx->Polygon.OffsetLine ||
+        ctx->Polygon.OffsetFill) {
++      raster->offset_point = ctx->Polygon.OffsetPoint;
++      raster->offset_line = ctx->Polygon.OffsetLine;
++      raster->offset_tri = ctx->Polygon.OffsetFill;
+       raster->offset_units = ctx->Polygon.OffsetUnits;
+       raster->offset_scale = ctx->Polygon.OffsetFactor;
+    }
+diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
+index 63dbdb2..36fffe9 100644
+--- a/src/mesa/state_tracker/st_cb_bitmap.c
++++ b/src/mesa/state_tracker/st_cb_bitmap.c
+@@ -675,11 +675,12 @@ st_flush_bitmap_cache(struct st_context *st)
+  * \return  GL_TRUE for success, GL_FALSE if bitmap is too large, etc.
+  */
+ static GLboolean
+-accum_bitmap(struct st_context *st,
++accum_bitmap(struct gl_context *ctx,
+              GLint x, GLint y, GLsizei width, GLsizei height,
+              const struct gl_pixelstore_attrib *unpack,
+              const GLubyte *bitmap )
+ {
++   struct st_context *st = ctx->st;
+    struct bitmap_cache *cache = st->bitmap.cache;
+    int px = -999, py = -999;
+    const GLfloat z = st->ctx->Current.RasterPos[2];
+@@ -729,9 +730,17 @@ accum_bitmap(struct st_context *st,
+    /* create the transfer if needed */
+    create_cache_trans(st);
+ 
++   /* PBO source... */
++   bitmap = _mesa_map_pbo_source(ctx, unpack, bitmap);
++   if (!bitmap) {
++      return FALSE;
++   }
++
+    unpack_bitmap(st, px, py, width, height, unpack, bitmap,
+                  cache->buffer, BITMAP_CACHE_WIDTH);
+ 
++   _mesa_unmap_pbo_source(ctx, unpack);
++
+    return GL_TRUE; /* accumulated */
+ }
+ 
+@@ -764,7 +773,7 @@ st_Bitmap(struct gl_context *ctx, GLint x, GLint y,
+                                                           semantic_indexes);
+    }
+ 
+-   if (UseBitmapCache && accum_bitmap(st, x, y, width, height, unpack, bitmap))
++   if (UseBitmapCache && accum_bitmap(ctx, x, y, width, height, unpack, bitmap))
+       return;
+ 
+    pt = make_bitmap_texture(ctx, width, height, unpack, bitmap);
+diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
+index de62264..bff8d9b 100644
+--- a/src/mesa/state_tracker/st_draw.c
++++ b/src/mesa/state_tracker/st_draw.c
+@@ -283,7 +283,7 @@ st_draw_vbo(struct gl_context *ctx,
+          /* don't trim, restarts might be inside index list */
+          cso_draw_vbo(st->cso_context, &info);
+       }
+-      else if (u_trim_pipe_prim(info.mode, &info.count))
++      else if (u_trim_pipe_prim(prims[i].mode, &info.count))
+          cso_draw_vbo(st->cso_context, &info);
+    }
+ 
+diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
+index a9111b5..f56f7cb 100644
+--- a/src/mesa/state_tracker/st_program.c
++++ b/src/mesa/state_tracker/st_program.c
+@@ -1142,7 +1142,7 @@ st_print_shaders(struct gl_context *ctx)
+ static void
+ destroy_program_variants(struct st_context *st, struct gl_program *program)
+ {
+-   if (!program)
++   if (!program || program == &_mesa_DummyProgram)
+       return;
+ 
+    switch (program->Target) {
diff --git a/mesa.spec b/mesa.spec
index bcfa976..f360bdf 100644
--- a/mesa.spec
+++ b/mesa.spec
@@ -1,5 +1,4 @@
 %if 0%{?rhel}
-%define rhel_no_hw_arches ppc ppc64 ppc64p7
 %define with_private_llvm 1
 %else
 %define with_private_llvm 0
@@ -15,7 +14,7 @@
 %endif
 
 # S390 doesn't have video cards, but we need swrast for xserver's GLX
-%ifarch s390 s390x  %{?rhel_no_hw_arches}
+%ifarch s390 s390x
 %define with_hardware 0
 %define dri_drivers --with-dri-drivers=swrast
 %else
@@ -49,7 +48,7 @@
 Summary: Mesa graphics libraries
 Name: mesa
 Version: 9.1
-Release: 2%{?dist}
+Release: 3%{?dist}
 License: MIT
 Group: System Environment/Libraries
 URL: http://www.mesa3d.org
@@ -64,16 +63,18 @@ Source3: make-git-snapshot.sh
 # Fedora opts to ignore the optional part of clause 2 and treat that code as 2 clause BSD.
 Source4: Mesa-MLAA-License-Clarification-Email.txt
 
-# -fno-rtti makes nv50 assert angry
-Patch0: nv50-fix-build.patch
-Patch1: intel-revert-gl3.patch
+# git diff-tree -p mesa-9.1..origin/9.1 > `git describe origin/9.1`.patch
+Patch0: mesa-9.1-53-gd0ccb5b.patch
+
+Patch1: nv50-fix-build.patch
+Patch2: intel-revert-gl3.patch
 #Patch7: mesa-7.1-link-shared.patch
 Patch9: mesa-8.0-llvmpipe-shmget.patch
 #Patch11: mesa-8.0-nouveau-tfp-blacklist.patch
 Patch12: mesa-8.0.1-fix-16bpp.patch
-#Patch13: mesa-9.0.1-less-cxx-please.patch
 Patch14: i965-hack-hiz-snb-fix.patch
 
+
 BuildRequires: pkgconfig autoconf automake libtool
 %if %{with_hardware}
 BuildRequires: kernel-headers
@@ -282,8 +283,9 @@ Mesa shared glapi
 %prep
 %setup -q -n Mesa-%{version}%{?snapshot}
 #setup -q -n mesa-%{gitdate}
-%patch0 -p1 -b .nv50rtti
-%patch1 -p1 -b .nogl3
+%patch0 -p1 -b .git
+%patch1 -p1 -b .nv50rtti
+%patch2 -p1 -b .nogl3
 #%patch11 -p1 -b .nouveau
 
 # this fastpath is:
@@ -297,8 +299,6 @@ Mesa shared glapi
 #patch9 -p1 -b .shmget
 #patch12 -p1 -b .16bpp
 
-#%patch13 -p1 -b .less-cpp
-
 # hack from chromium - awaiting real upstream fix
 %patch14 -p1 -b .snbfix
 # default to dri (not xlib) for libGL on all arches
@@ -592,6 +592,9 @@ rm -rf $RPM_BUILD_ROOT
 %endif
 
 %changelog
+* Tue Mar 19 2013 Adam Jackson <ajax@redhat.com> 9.1-3
+- mesa-9.1-53-gd0ccb5b.patch: Sync with today's git
+
 * Tue Mar 19 2013 Dave Airlie <airlied@redhat.com> 9.1-2
 - add SNB hang workaround from chromium