[Rawstudio-commit] r4134 - trunk/plugins/dcp

Klaus Post klauspost at gmail.com
Thu Feb 16 19:07:44 CET 2012


Author: post
Date: 2012-02-16 19:07:44 +0100 (Thu, 16 Feb 2012)
New Revision: 4134

Modified:
   trunk/plugins/dcp/dcp-avx.c
   trunk/plugins/dcp/dcp.c
Log:
Use SSE4 functions for AVX and re-enable the AVX render path, but use the SSE2 tonemapping function, since GCC 4.6.1 seems to miscompile the AVX version of this function.

Modified: trunk/plugins/dcp/dcp-avx.c
===================================================================
--- trunk/plugins/dcp/dcp-avx.c	2012-02-13 17:08:23 UTC (rev 4133)
+++ trunk/plugins/dcp/dcp-avx.c	2012-02-16 18:07:44 UTC (rev 4134)
@@ -21,7 +21,7 @@
 
 #ifdef __AVX__
 
-#include <emmintrin.h>
+#include <smmintrin.h>
 #include <math.h> /* powf() */
 
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
@@ -38,26 +38,17 @@
 static gfloat _six_ps[4] __attribute__ ((aligned (16))) = {6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15};
 static gfloat _very_small_ps[4] __attribute__ ((aligned (16))) = {1e-15, 1e-15, 1e-15, 1e-15};
 static const gfloat _two_to_23_ps[4] __attribute__ ((aligned (16))) = { 0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f };
-static guint _ps_mask_sign[4] __attribute__ ((aligned (16))) = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
 
 #define DW(A) _mm_castps_si128(A)
 #define PS(A) _mm_castsi128_ps(A)
 
-/* Floor for positive numbers */
-static inline __m128 _mm_floor_positive_ps( __m128 v )
-{
-	__m128 two_to_23_ps = _mm_load_ps(_two_to_23_ps);
-	return _mm_sub_ps( _mm_add_ps( v, two_to_23_ps ), two_to_23_ps );
-}
 
 static inline void
-RGBtoHSV_AVX(__m128 *c0, __m128 *c1, __m128 *c2)
+RGBtoHSV_SSE4(__m128 *c0, __m128 *c1, __m128 *c2)
 {
-
-	__m128i zero_i = _mm_setzero_si128();
+	__m128 zero_ps = _mm_setzero_ps();
 	__m128 small_ps = _mm_load_ps(_very_small_ps);
 	__m128 ones_ps = _mm_load_ps(_ones_ps);
-	__m128i ps_mask_sign = _mm_load_si128((__m128i*)_ps_mask_sign);
 	
 	// Any number > 1
 	__m128 add_v = _mm_load_ps(_two_ps);
@@ -71,12 +62,16 @@
 	g =  _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps);
 	b =  _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps);
 
-	__m128 v = _mm_max_ps(b,_mm_max_ps(r,g));
+	__m128 h, v;
+	v = _mm_max_ps(b,_mm_max_ps(r,g));
+
 	__m128 m = _mm_min_ps(b,_mm_min_ps(r,g));
 	__m128 gap = _mm_sub_ps(v,m);
-	__m128 v_mask = PS(_mm_cmpeq_epi32(_mm_and_si128(DW(gap), ps_mask_sign), zero_i));
+	__m128 v_mask = _mm_cmpeq_ps(gap, zero_ps);
 	v = _mm_add_ps(v, _mm_and_ps(add_v, v_mask));
 
+	h = _mm_setzero_ps();
+
 	/* Set gap to one where sat = 0, this will avoid divisions by zero, these values will not be used */
 	ones_ps = _mm_and_ps(ones_ps, v_mask);
 	gap = _mm_or_ps(gap, ones_ps);
@@ -85,32 +80,32 @@
 
 	/* if r == v */
 	/* h = (g - b) / gap; */
-	__m128i mask = _mm_cmpeq_epi32(DW(r), DW(v));
+	__m128 mask = _mm_cmpeq_ps(r, v);
 	__m128 val = _mm_mul_ps(gap_inv, _mm_sub_ps(g, b));
 
 	/* fill h */
-	v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
-	__m128i h = _mm_and_si128(DW(val), mask);
+	v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+	h = _mm_blendv_ps(h, val, mask);
 
 	/* if g == v */
 	/* h = 2.0f + (b - r) / gap; */
 	__m128 two_ps = _mm_load_ps(_two_ps);
-	mask = _mm_cmpeq_epi32(DW(g), DW(v));
+	mask = _mm_cmpeq_ps(g, v);
 	val = _mm_sub_ps(b, r);
 	val = _mm_mul_ps(val, gap_inv);
 	val = _mm_add_ps(val, two_ps);
 
-	v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
-	h = _mm_or_si128(h, _mm_and_si128(DW(val), mask));
+	v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+	h = _mm_blendv_ps(h, val, mask);
 
 	/* If (b == v) */
 	/* h = 4.0f + (r - g) / gap; */
 	__m128 four_ps = _mm_add_ps(two_ps, two_ps);
-	mask = _mm_cmpeq_epi32(DW(b), DW(v));
+	mask = _mm_cmpeq_ps(b, v);
 	val = _mm_add_ps(four_ps, _mm_mul_ps(gap_inv, _mm_sub_ps(r, g)));
 
-	h = _mm_or_si128(h, _mm_and_si128(DW(val), mask));
-	v = _mm_add_ps(v, _mm_and_ps(add_v, PS(mask)));
+	v = _mm_add_ps(v, _mm_and_ps(add_v, mask));
+	h = _mm_blendv_ps(h, val, mask);
 
 	__m128 s;
 	/* Fill s, if gap > 0 */
@@ -119,20 +114,19 @@
 	s = _mm_andnot_ps(v_mask, val );
 
 	/* Check if h < 0 */
-	zero_i = _mm_setzero_si128();
-	__m128i six_ps_i = _mm_load_si128((__m128i*)_six_ps);
-	/* We can use integer comparision, since we are checking if h < 0, since the sign bit is same in integer */
-	mask = _mm_cmplt_epi32(h, zero_i);
-	__m128 h2 = _mm_add_ps(PS(h), PS(_mm_and_si128(mask, six_ps_i)));
+	zero_ps = _mm_setzero_ps();
+	__m128 six_ps = _mm_load_ps(_six_ps);
+	mask = _mm_cmplt_ps(h, zero_ps);
+	h = _mm_add_ps(h, _mm_and_ps(mask, six_ps));
 
-	*c0 = h2;
+	*c0 = h;
 	*c1 = s;
 	*c2 = v;
 }
 
 
 static inline void
-HSVtoRGB_SSE(__m128 *c0, __m128 *c1, __m128 *c2)
+HSVtoRGB_SSE4(__m128 *c0, __m128 *c1, __m128 *c2)
 {
 	__m128 h = *c0;
 	__m128 s = *c1;
@@ -140,9 +134,9 @@
 	__m128 r, g, b;
 	
 	/* Convert get the fraction of h
-	* h_fraction = h - floor(h) */
+	* h_fraction = h - (float)(int)h */
 	__m128 ones_ps = _mm_load_ps(_ones_ps);
-	__m128 h_fraction = _mm_sub_ps(h,_mm_floor_positive_ps(h));
+	__m128 h_fraction = _mm_sub_ps(h, _mm_floor_ps(h));
 
 	/* p = v * (1.0f - s)  */
 	__m128 p = _mm_mul_ps(v,  _mm_sub_ps(ones_ps, s));
@@ -164,9 +158,9 @@
 	__m128 m = _mm_cmplt_ps(h, h_threshold);
 	h_threshold = _mm_add_ps(h_threshold, ones_ps);
 	m = _mm_andnot_ps(out_mask, m);
-	r = _mm_or_ps(r, _mm_and_ps(q, m));
-	g = _mm_or_ps(g, _mm_and_ps(v, m));
-	b = _mm_or_ps(b, _mm_and_ps(p, m));
+	r = _mm_blendv_ps(r, q, m);
+	g = _mm_blendv_ps(g, v, m);
+	b = _mm_blendv_ps(b, p, m);
 	out_mask = _mm_or_ps(out_mask, m);
 
 	/* h < 3 (case 2)*/
@@ -174,9 +168,9 @@
 	m = _mm_cmplt_ps(h, h_threshold);
 	h_threshold = _mm_add_ps(h_threshold, ones_ps);
 	m = _mm_andnot_ps(out_mask, m);
-	r = _mm_or_ps(r, _mm_and_ps(p, m));
-	g = _mm_or_ps(g, _mm_and_ps(v, m));
-	b = _mm_or_ps(b, _mm_and_ps(t, m));
+	r = _mm_blendv_ps(r, p, m);
+	g = _mm_blendv_ps(g, v, m);
+	b = _mm_blendv_ps(b, t, m);
 	out_mask = _mm_or_ps(out_mask, m);
 
 	/* h < 4 (case 3)*/
@@ -184,9 +178,9 @@
 	m = _mm_cmplt_ps(h, h_threshold);
 	h_threshold = _mm_add_ps(h_threshold, ones_ps);
 	m = _mm_andnot_ps(out_mask, m);
-	r = _mm_or_ps(r, _mm_and_ps(p, m));
-	g = _mm_or_ps(g, _mm_and_ps(q, m));
-	b = _mm_or_ps(b, _mm_and_ps(v, m));
+	r = _mm_blendv_ps(r, p, m);
+	g = _mm_blendv_ps(g, q, m);
+	b = _mm_blendv_ps(b, v, m);
 	out_mask = _mm_or_ps(out_mask, m);
 
 	/* h < 5 (case 4)*/
@@ -201,9 +195,11 @@
 
 	/* Remainder (case 5) */
 	/* case 5: *r = v; *g = p; *b = q; break; */
-	r = _mm_or_ps(r, _mm_andnot_ps(out_mask,v));
-	g = _mm_or_ps(g, _mm_andnot_ps(out_mask,p));
-	b = _mm_or_ps(b, _mm_andnot_ps(out_mask,q));
+	__m128 all_ones = _mm_cmpeq_ps(h,h);
+	m = _mm_xor_ps(out_mask, all_ones);
+	r = _mm_blendv_ps(r, v, m);
+	g = _mm_blendv_ps(g, p, m);
+	b = _mm_blendv_ps(b, q, m);
 	
 	*c0 = r;
 	*c1 = g;
@@ -211,10 +207,10 @@
 }
 
 
+/* GCC 4.6.1 seems to miscompile this function with AVX, so disabled for now */
+#if 0
 static gint _ones_epi32[4] __attribute__ ((aligned (16))) = {1,1,1,1};
 
-/* Until now the same as the SSE2 version, but included here to allow AVX compilation */
-/* to utilize 3-paramater instructions */
 
 void
 huesat_map_AVX(RSHuesatMap *map, const PrecalcHSM* precalc, __m128 *_h, __m128 *_s, __m128 *_v)
@@ -331,7 +327,6 @@
 		__m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale));
 		__m128 sScaled = _mm_mul_ps(s,  _mm_load_ps(precalc->sScale));
 		__m128 vScaled = _mm_mul_ps(v,  _mm_load_ps(precalc->vScale));
-
 		__m128i hIndex0 = _mm_cvttps_epi32(hScaled);
 		__m128i sIndex0 = _mm_cvttps_epi32(sScaled);
 		__m128i vIndex0 = _mm_cvttps_epi32(vScaled);
@@ -480,7 +475,9 @@
 	*_s = s;
 	*_v = v;
 }
+#endif
 
+
 static gfloat _16_bit_ps[4] __attribute__ ((aligned (16))) = {65535.0, 65535.0, 65535.0, 65535.0};
 static gfloat _thousand_24_ps[4] __attribute__ ((aligned (16))) = {1023.99999f, 1023.99999f, 1023.99999f, 1023.99999f};
 
@@ -494,7 +491,7 @@
 	_mm_store_si128((__m128i*)&xfer[0], lookup);
 
 	/* Calculate fractions */
-	__m128 frac = _mm_sub_ps(mul, _mm_floor_positive_ps(mul));
+	__m128 frac = _mm_sub_ps(mul, _mm_floor_ps(mul));
 	__m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac);
 
 	/* Load two adjacent curve values and interpolate between them */
@@ -614,6 +611,8 @@
 #define SETFLOAT4_SAME(N, A) float N[4] __attribute__ ((aligned (16))); \
 N[0] = A; N[1] = A; N[2] = A; N[3] = A;
 
+extern void huesat_map_SSE2(RSHuesatMap *map, const PrecalcHSM* precalc, __m128 *_h, __m128 *_s, __m128 *_v);
+
 gboolean
 render_AVX(ThreadInfo* t)
 {
@@ -728,12 +727,12 @@
 			g2 = sse_matrix3_mul(&cam_prof[12], r, g, b);
 			b2 = sse_matrix3_mul(&cam_prof[24], r, g, b);
 
-			RGBtoHSV_AVX(&r2, &g2, &b2);
+			RGBtoHSV_SSE4(&r2, &g2, &b2);
 			h = r2; s = g2; v = b2;
 
 			if (dcp->huesatmap)
 			{
-				huesat_map_AVX(dcp->huesatmap, dcp->huesatmap_precalc, &h, &s, &v);
+				huesat_map_SSE2(dcp->huesatmap, dcp->huesatmap_precalc, &h, &s, &v);
 			}
 
 			/* Saturation */
@@ -766,7 +765,7 @@
 			h = _mm_add_ps(h, six_masked_lt);
 			__m128 v_stored = v;
 
-			HSVtoRGB_SSE(&h, &s, &v);
+			HSVtoRGB_SSE4(&h, &s, &v);
 			r = h; g = s; b = v;
 			
 			/* Exposure */
@@ -849,7 +848,7 @@
 			}
 
 			/* Convert to HSV */
-			RGBtoHSV_AVX(&r, &g, &b);
+			RGBtoHSV_SSE4(&r, &g, &b);
 			h = r; s = g; v = b;
 
 			if (!dcp->curve_is_flat)			
@@ -860,7 +859,7 @@
 				_mm_store_si128((__m128i*)&xfer[0], lookup);
 
 				/* Calculate fractions */
-				__m128 frac = _mm_sub_ps(v_mul, _mm_floor_positive_ps(v_mul));
+				__m128 frac = _mm_sub_ps(v_mul, _mm_floor_ps(v_mul));
 				__m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac);
 				
 				/* Load two adjacent curve values and interpolate between them */
@@ -877,7 +876,7 @@
 
 			/* Apply looktable */
 			if (dcp->looktable) {
-				huesat_map_AVX(dcp->looktable, dcp->looktable_precalc, &h, &s, &v);
+				huesat_map_SSE2(dcp->looktable, dcp->looktable_precalc, &h, &s, &v);
 			}
 			
 			/* Ensure that hue is within range */
@@ -892,7 +891,7 @@
 			/* s always slightly > 0 when converting to RGB */
 			s = _mm_max_ps(s, min_val);
 
-			HSVtoRGB_SSE(&h, &s, &v);
+			HSVtoRGB_SSE4(&h, &s, &v);
 			r = h; g = s; b = v;
 
 			/* Apply Tone Curve  in RGB space*/

Modified: trunk/plugins/dcp/dcp.c
===================================================================
--- trunk/plugins/dcp/dcp.c	2012-02-13 17:08:23 UTC (rev 4133)
+++ trunk/plugins/dcp/dcp.c	2012-02-16 18:07:44 UTC (rev 4134)
@@ -471,7 +471,7 @@
 	pre_cache_tables(t->dcp);
 	if (tmp->pixelsize == 4  && (rs_detect_cpu_features() & RS_CPU_FLAG_SSE2) && !t->dcp->read_out_curve)
 	{
-		if (FALSE && (rs_detect_cpu_features() & RS_CPU_FLAG_AVX) && render_AVX(t))
+		if ((rs_detect_cpu_features() & RS_CPU_FLAG_AVX) && render_AVX(t))
 		{
 			/* AVX routine renders 4 pixels in parallel, but any remaining must be */
 			/* calculated using C routines */




More information about the Rawstudio-commit mailing list