[Rawstudio-commit] r3057 - branches/rawstudio-ng-color/plugins/colorspace-transform

Klaus Post klauspost at gmail.com
Sat Jan 23 14:57:11 CET 2010


Author: post
Date: 2010-01-23 14:57:11 +0100 (Sat, 23 Jan 2010)
New Revision: 3057

Added:
   branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
Modified:
   branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am
   branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c
Log:
Colorspace transform. Add multithreading, ROI and SSE2 code to all internal 16 bit -> 8 bit conversions. Note that the GdkPixbuffer now has an alpha channel, which might all to other code changes needed.

Modified: branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am
===================================================================
--- branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am	2010-01-23 12:55:16 UTC (rev 3056)
+++ branches/rawstudio-ng-color/plugins/colorspace-transform/Makefile.am	2010-01-23 13:57:11 UTC (rev 3057)
@@ -17,6 +17,19 @@
 
 libdir = $(datadir)/rawstudio/plugins/
 
-colorspace_transform_la_LIBADD = @PACKAGE_LIBS@
+colorspace_transform_la_LIBADD = @PACKAGE_LIBS@ colorspace_transform_sse2.lo rs-cmm.lo
 colorspace_transform_la_LDFLAGS = -module -avoid-version
-colorspace_transform_la_SOURCES = colorspace_transform.c rs-cmm.c rs-cmm.h
+
+colorspace_transform.lo: colorspace_transform.c
+	$(LTCOMPILE) -DEXIT_CODE=0 -c colorspace_transform.c
+
+rs-cmm.lo: rs-cmm.c rs-cmm.h
+	$(LTCOMPILE) -DEXIT_CODE=2 -c rs-cmm.c
+
+colorspace_transform_sse2.lo: colorspace_transform_sse2.c
+if CAN_COMPILE_SSE2
+SSE_FLAG=-msse2
+else
+SSE_FLAG=
+endif
+	$(LTCOMPILE) $(SSE_FLAG) -DEXIT_CODE=1 -c colorspace_transform_sse2.c

Modified: branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c
===================================================================
--- branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c	2010-01-23 12:55:16 UTC (rev 3056)
+++ branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform.c	2010-01-23 13:57:11 UTC (rev 3057)
@@ -31,6 +31,23 @@
 typedef struct _RSColorspaceTransform RSColorspaceTransform;
 typedef struct _RSColorspaceTransformClass RSColorspaceTransformClass;
 
+typedef struct {
+	RSColorspaceTransform *cst;
+	GThread *threadid;
+	gint start_x;
+	gint start_y;
+	gint end_x;
+	gint end_y;
+	RS_IMAGE16 *input;
+	void *output;
+	RSColorSpace *input_space;
+	RSColorSpace *output_space;
+	RS_MATRIX3 *matrix;
+	gboolean gamma_correct;
+	guchar* table8;
+	gfloat output_gamma;
+} ThreadInfo;
+
 struct _RSColorspaceTransform {
 	RSFilter parent;
 
@@ -51,10 +68,15 @@
 static RSFilterResponse *get_image(RSFilter *filter, const RSFilterRequest *request);
 static RSFilterResponse *get_image8(RSFilter *filter, const RSFilterRequest *request);
 static gboolean convert_colorspace16(RSColorspaceTransform *colorspace_transform, RS_IMAGE16 *input_image, RS_IMAGE16 *output_image, RSColorSpace *input_space, RSColorSpace *output_space);
-static void convert_colorspace8(RSColorspaceTransform *colorspace_transform, RS_IMAGE16 *input_image, GdkPixbuf *output_image, RSColorSpace *input_space, RSColorSpace *output_space);
+static void convert_colorspace8(RSColorspaceTransform *colorspace_transform, RS_IMAGE16 *input_image, GdkPixbuf *output_image, RSColorSpace *input_space, RSColorSpace *output_space, GdkRectangle *roi);
 
 static RSFilterClass *rs_colorspace_transform_parent_class = NULL;
 
+/* SSE2 optimized functions */
+extern void transform8_srgb_sse2(ThreadInfo* t);
+extern void transform8_otherrgb_sse2(ThreadInfo* t);
+extern gboolean cst_has_sse2();
+
 G_MODULE_EXPORT void
 rs_plugin_load(RSPlugin *plugin)
 {
@@ -106,8 +128,6 @@
 		if (convert_colorspace16(colorspace_transform, input, output, input_space, output_space))
 		{
 			/* Image was converted */
-			printf("\033[33m16 input_space: %s\033[0m\n", (input_space) ? G_OBJECT_TYPE_NAME(input_space) : "none");
-			printf("\033[33m16 output_space: %s\n\033[0m", (output_space) ? G_OBJECT_TYPE_NAME(output_space) : "none");
 			response = rs_filter_response_clone(previous_response);
 			g_object_unref(previous_response);
 			rs_filter_response_set_image(response, output);
@@ -137,53 +157,60 @@
 	RSFilterResponse *response;
 	RS_IMAGE16 *input;
 	GdkPixbuf *output = NULL;
+	GdkRectangle *roi;
 
 	previous_response = rs_filter_get_image(filter->previous, request);
 	input = rs_filter_response_get_image(previous_response);
 	if (!RS_IS_IMAGE16(input))
 		return previous_response;
 
+	roi = rs_filter_request_get_roi(request);
 	RSColorSpace *input_space = rs_filter_param_get_object_with_type(RS_FILTER_PARAM(previous_response), "colorspace", RS_TYPE_COLOR_SPACE);
 	RSColorSpace *output_space = rs_filter_param_get_object_with_type(RS_FILTER_PARAM(request), "colorspace", RS_TYPE_COLOR_SPACE);
 
+#if 0
 	printf("\033[33m8 input_space: %s\033[0m\n", (input_space) ? G_OBJECT_TYPE_NAME(input_space) : "none");
 	printf("\033[33m8 output_space: %s\n\033[0m", (output_space) ? G_OBJECT_TYPE_NAME(output_space) : "none");
+#endif
 
 	response = rs_filter_response_clone(previous_response);
 	g_object_unref(previous_response);
-	output = gdk_pixbuf_new(GDK_COLORSPACE_RGB, FALSE, 8, input->w, input->h);
+	output = gdk_pixbuf_new(GDK_COLORSPACE_RGB, TRUE, 8, input->w, input->h);
 
-	convert_colorspace8(colorspace_transform, input, output, input_space, output_space);
+	/* Process output */
+	convert_colorspace8(colorspace_transform, input, output, input_space, output_space, roi);
 
 	rs_filter_response_set_image8(response, output);
 	g_object_unref(output);
-
-	/* Process output */
-
 	g_object_unref(input);
 	return response;
 }
 
 static void
-transform8_c(RS_IMAGE16 *input, GdkPixbuf *output, RS_MATRIX3 *matrix, guchar *table8)
+transform8_c(ThreadInfo* t)
 {
 	gint row;
 	gint r,g,b;
 	gint width;
 	RS_MATRIX3Int mati;
-
+	RS_IMAGE16 *input = t->input;
+	GdkPixbuf *output = (GdkPixbuf *)t->output;
+	guchar *table8 = t->table8;
+	
+	g_assert(RS_IS_IMAGE16(input));
+	g_assert(GDK_IS_PIXBUF(output));
 	gint o_channels = gdk_pixbuf_get_n_channels(output);
 
-	matrix3_to_matrix3int(matrix, &mati);
+	matrix3_to_matrix3int(t->matrix, &mati);
 
-	for(row=0 ; row<input->h ; row++)
+	for(row=t->start_y ; row<t->end_y ; row++)
 	{
-		gushort *i = GET_PIXEL(input, 0, row);
-		guchar *o = GET_PIXBUF_PIXEL(output, 0, row);
+		gushort *i = GET_PIXEL(input, t->start_x, row);
+		guchar *o = GET_PIXBUF_PIXEL(output, t->start_x, row);
 
-		width = input->w;
+		width = t->end_x - t->start_x;
 
-		while(width--)
+		while(width-- > 0)
 		{
 			r =
 				( i[R] * mati.coeff[0][0]
@@ -208,12 +235,15 @@
 			o[R] = table8[r];
 			o[G] = table8[g];
 			o[B] = table8[b];
+			o[3] = 255;
+
 			i += input->pixelsize;
 			o += o_channels;
 		}
 	}
 }
 
+
 static void
 transform16_c(gushort* __restrict input, gushort* __restrict output, gint num_pixels, const gint pixelsize, RS_MATRIX3 *matrix)
 {
@@ -301,8 +331,65 @@
 	return TRUE;
 }
 
+gpointer
+start_single_cs8_transform_thread(gpointer _thread_info)
+{
+	ThreadInfo* t = _thread_info;
+	RS_IMAGE16 *input_image = t->input; 
+	GdkPixbuf *output = (GdkPixbuf*) t->output;
+	RSColorSpace *input_space = t->input_space;
+	RSColorSpace *output_space = t->output_space;
+
+	g_assert(RS_IS_IMAGE16(input_image));
+	g_assert(GDK_IS_PIXBUF(output));
+	g_assert(RS_IS_COLOR_SPACE(input_space));
+	g_assert(RS_IS_COLOR_SPACE(output_space));
+
+	gboolean sse2_available = (!!(rs_detect_cpu_features() & RS_CPU_FLAG_SSE2)) && cst_has_sse2();
+
+	if (sse2_available && rs_color_space_new_singleton("RSSrgb") == output_space)
+	{
+		transform8_srgb_sse2(t);
+		return (NULL);
+	}
+	if (sse2_available && rs_color_space_new_singleton("RSAdobeRGB") == output_space)
+	{
+		t->output_gamma = 1.0 / 2.19921875;
+		transform8_otherrgb_sse2(t);
+		return (NULL);
+	}
+	if (sse2_available && rs_color_space_new_singleton("RSProphoto") == output_space)
+	{
+		t->output_gamma = 1.0 / 1.8;
+		transform8_otherrgb_sse2(t);
+		return (NULL);
+	}
+	
+	/* Fall back to C-functions */
+	/* Calculate our gamma table */
+	const RS1dFunction *input_gamma = rs_color_space_get_gamma_function(input_space);
+	const RS1dFunction *output_gamma = rs_color_space_get_gamma_function(output_space);
+	guchar table8[65536];
+	gint i;
+	for(i=0;i<65536;i++)
+	{
+		gdouble nd = ((gdouble) i) * (1.0/65535.0);
+
+		nd = rs_1d_function_evaluate_inverse(input_gamma, nd);
+		nd = rs_1d_function_evaluate(output_gamma, nd);
+
+		/* 8 bit output */
+		gint res = (gint) (nd*255.0 + 0.5f);
+		_CLAMP255(res);
+		table8[i] = res;
+	}
+	t->table8 = table8;
+	transform8_c(t);
+	return (NULL);
+}
+
 static void
-convert_colorspace8(RSColorspaceTransform *colorspace_transform, RS_IMAGE16 *input_image, GdkPixbuf *output_image, RSColorSpace *input_space, RSColorSpace *output_space)
+convert_colorspace8(RSColorspaceTransform *colorspace_transform, RS_IMAGE16 *input_image, GdkPixbuf *output_image, RSColorSpace *input_space, RSColorSpace *output_space, GdkRectangle *_roi)
 {
 	g_assert(RS_IS_IMAGE16(input_image));
 	g_assert(GDK_IS_PIXBUF(output_image));
@@ -313,6 +400,16 @@
 	g_assert(input_image->w == gdk_pixbuf_get_width(output_image));
 	g_assert(input_image->h == gdk_pixbuf_get_height(output_image));
 
+	GdkRectangle *roi = _roi;
+	if (!roi) 
+	{
+		roi = g_new(GdkRectangle, 1);
+		roi->x = 0;
+		roi->y = 0;
+		roi->width = input_image->w;
+		roi->height = input_image->h;
+	}
+
 	/* If a CMS is needed, do the transformation using LCMS */
 	if (RS_COLOR_SPACE_REQUIRES_CMS(input_space) || RS_COLOR_SPACE_REQUIRES_CMS(output_space))
 	{
@@ -332,34 +429,45 @@
 	{
 		const RS_MATRIX3 a = rs_color_space_get_matrix_from_pcs(input_space);
 		const RS_MATRIX3 b = rs_color_space_get_matrix_to_pcs(output_space);
+		
 		RS_MATRIX3 mat;
 		matrix3_multiply(&b, &a, &mat);
 
-		/* Calculate our gamma table */
-		/* FIXME: Move this to someplace where we can cache it! */
-		guchar table8[65536];
 		gint i;
+		guint y_offset, y_per_thread, threaded_h;
+		const guint threads = rs_get_number_of_processor_cores();
+		ThreadInfo *t = g_new(ThreadInfo, threads);
 
-		const RS1dFunction *input_gamma = rs_color_space_get_gamma_function(input_space);
-		const RS1dFunction *output_gamma = rs_color_space_get_gamma_function(output_space);
+		threaded_h = roi->height;
+		y_per_thread = (threaded_h + threads-1)/threads;
+		y_offset = roi->y;
 
-		for(i=0;i<65536;i++)
+		for (i = 0; i < threads; i++)
 		{
-			gdouble nd = ((gdouble) i) * (1.0/65535.0);
+			t[i].input = input_image;
+			t[i].output = output_image;
+			t[i].start_y = y_offset;
+			t[i].start_x = roi->x;
+			t[i].end_x = roi->x + roi->width;
+			t[i].cst = colorspace_transform;
+			t[i].input_space = input_space;
+			t[i].output_space = output_space;
+			y_offset += y_per_thread;
+			y_offset = MIN(input_image->h, y_offset);
+			t[i].end_y = y_offset;
+			t[i].matrix = &mat;
+			t[i].table8 = NULL;
 
-			nd = rs_1d_function_evaluate_inverse(input_gamma, nd);
-			nd = rs_1d_function_evaluate(output_gamma, nd);
-
-			/* 8 bit output */
-			gint res = (gint) (nd*255.0 + 0.5f);
-			_CLAMP255(res);
-			table8[i] = res;
+			t[i].threadid = g_thread_create(start_single_cs8_transform_thread, &t[i], TRUE, NULL);
 		}
 
-		transform8_c(
-			input_image,
-			output_image,
-			&mat,
-			table8);
+		/* Wait for threads to finish */
+		for(i = 0; i < threads; i++)
+			g_thread_join(t[i].threadid);
+
+		g_free(t);
 	}
+	/* If we created the ROI here, free it */
+	if (!_roi) 
+		g_free(roi);
 }

Added: branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c
===================================================================
--- branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c	                        (rev 0)
+++ branches/rawstudio-ng-color/plugins/colorspace-transform/colorspace_transform_sse2.c	2010-01-23 13:57:11 UTC (rev 3057)
@@ -0,0 +1,534 @@
+/*
+ * Copyright (C) 2006-2009 Anders Brander <anders at brander.dk> and 
+ * Anders Kvist <akv at lnxbx.dk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/* Plugin tmpl version 5 */
+
+#include <rawstudio.h>
+#include <lcms.h>
+#include "rs-cmm.h"
+
+typedef struct _RSColorspaceTransform RSColorspaceTransform;
+typedef struct _RSColorspaceTransformClass RSColorspaceTransformClass;
+
+typedef struct {
+	RSColorspaceTransform *cst;
+	GThread *threadid;
+	gint start_x;
+	gint start_y;
+	gint end_x;
+	gint end_y;
+	RS_IMAGE16 *input;
+	void *output;
+	RSColorSpace *input_space;
+	RSColorSpace *output_space;
+	RS_MATRIX3 *matrix;
+	gboolean gamma_correct;
+	guchar* table8;
+	gfloat output_gamma;
+} ThreadInfo;
+
+#if defined(__SSE2__)
+
+#include <emmintrin.h>
+
+
+/* SSE2 Polynomial pow function from Mesa3d (MIT License) */
+
+#define EXP_POLY_DEGREE 3
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline __m128 
+exp2f4(__m128 x)
+{
+	__m128i ipart;
+	__m128 fpart, expipart, expfpart;
+
+	x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+	x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+	/* ipart = int(x - 0.5) */
+	ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+	/* fpart = x - ipart */
+	fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+	/* expipart = (float) (1 << ipart) */
+	expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+	/* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+	expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+	expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+	expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+	expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+	return _mm_mul_ps(expipart, expfpart);
+}
+
+
+#define LOG_POLY_DEGREE 5
+
+static inline __m128 
+log2f4(__m128 x)
+{
+	__m128i exp = _mm_set1_epi32(0x7F800000);
+	__m128i mant = _mm_set1_epi32(0x007FFFFF);
+	__m128 one = _mm_set1_ps( 1.0f);
+	__m128i i = _mm_castps_si128(x);
+	__m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), _mm_set1_epi32(127)));
+	__m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
+	__m128 p;
+
+	/* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
+#if LOG_POLY_DEGREE == 6
+	p = POLY5( m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+	p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+	p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+	p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+	/* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+	p = _mm_mul_ps(p, _mm_sub_ps(m, one));
+
+	return _mm_add_ps(p, e);
+}
+
+static inline __m128
+_mm_fastpow_ps(__m128 x, __m128 y)
+{
+	return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+/* END: SSE2 Polynomial pow function from Mesa3d (MIT License) */
+
+
+static inline __m128
+sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c)
+{
+	__m128 v = _mm_load_ps(mul);
+	__m128 acc = _mm_mul_ps(a, v);
+
+	v = _mm_load_ps(mul+4);
+	acc = _mm_add_ps(acc, _mm_mul_ps(b, v));
+
+	v = _mm_load_ps(mul+8);
+	acc = _mm_add_ps(acc, _mm_mul_ps(c, v));
+
+	return acc;
+}
+
+
+static gfloat _junction_ps[4] __attribute__ ((aligned (16))) = {0.0031308, 0.0031308, 0.0031308, 0.0031308};
+static gfloat _normalize[4] __attribute__ ((aligned (16))) = {1.0f/65535.0f, 1.0f/65535.0f, 1.0f/65535.0f, 1.0f/65535.0f};
+static gfloat _8bit[4] __attribute__ ((aligned (16))) = {255.5f, 255.5f, 255.5f, 255.5f};
+static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f, 1.0f};
+static gfloat _srb_mul_under[4] __attribute__ ((aligned (16))) = {12.92f, 12.92f, 12.92f, 12.92f};
+static gfloat _srb_mul_over[4] __attribute__ ((aligned (16))) = {1.055f, 1.055f, 1.055f, 1.055f};
+static gfloat _srb_sub_over[4] __attribute__ ((aligned (16))) = {0.055f, 0.055f, 0.055f, 0.055f};
+static gfloat _srb_pow_over[4] __attribute__ ((aligned (16))) = {1.0/2.4, 1.0/2.4, 1.0/2.4, 1.0/2.4};
+static guint _alpha_mask[4] __attribute__ ((aligned (16))) = {0xff000000,0xff000000,0xff000000,0xff000000};
+
+void
+transform8_srgb_sse2(ThreadInfo* t)
+{
+	RS_IMAGE16 *input = t->input;
+	GdkPixbuf *output = t->output;
+	RS_MATRIX3 *matrix = t->matrix;
+	gint x,y;
+	gint width;
+
+	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
+	for (x = 0; x < 4; x++ ) {
+		mat_ps[x] = matrix->coeff[0][0];
+		mat_ps[x+4] = matrix->coeff[0][1];
+		mat_ps[x+8] = matrix->coeff[0][2];
+		mat_ps[12+x] = matrix->coeff[1][0];
+		mat_ps[12+x+4] = matrix->coeff[1][1];
+		mat_ps[12+x+8] = matrix->coeff[1][2];
+		mat_ps[24+x] = matrix->coeff[2][0];
+		mat_ps[24+x+4] = matrix->coeff[2][1];
+		mat_ps[24+x+8] = matrix->coeff[2][2];
+	}
+	
+	int start_x = t->start_x;
+	/* Always have aligned input and output adress */
+	if (start_x & 3)
+		start_x = ((start_x) / 4) * 4;
+	
+	int complete_w = t->end_x - start_x;
+	/* If width is not multiple of 4, check if we can extend it a bit */
+	if (complete_w & 3)
+	{
+		if ((t->end_x+4) < input->w)
+			complete_w = ((complete_w+3) / 4 *4);
+	}
+	
+	for(y=t->start_y ; y<t->end_y ; y++)
+	{
+		gushort *i = GET_PIXEL(input, start_x, y);
+		guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
+		gboolean aligned_write = !((guintptr)(o)&0xf);
+
+		width = complete_w >> 2;
+
+		while(width--)
+		{
+			/* Load and convert to float */
+			__m128i zero = _mm_setzero_si128();
+			__m128i in = _mm_load_si128((__m128i*)i); // Load two pixels
+			__m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels
+			_mm_prefetch(i + 64, _MM_HINT_NTA);
+			__m128i p1 =_mm_unpacklo_epi16(in, zero);
+			__m128i p2 =_mm_unpackhi_epi16(in, zero);
+			__m128i p3 =_mm_unpacklo_epi16(in2, zero);
+			__m128i p4 =_mm_unpackhi_epi16(in2, zero);
+			__m128 p1f  = _mm_cvtepi32_ps(p1);
+			__m128 p2f  = _mm_cvtepi32_ps(p2);
+			__m128 p3f  = _mm_cvtepi32_ps(p3);
+			__m128 p4f  = _mm_cvtepi32_ps(p4);
+			
+			/* Convert to planar */
+			__m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+			__m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+			__m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+			__m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+			__m128 b = _mm_movelh_ps(b1b0, b3b2);
+
+			/* Apply matrix to convert to sRGB */
+			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+			/* Normalize to 0->1 and clamp */
+			__m128 normalize = _mm_load_ps(_normalize);
+			__m128 max_val = _mm_load_ps(_ones_ps);
+			__m128 min_val = _mm_setzero_ps();
+			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
+			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
+			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));
+
+			/* Apply Gamma */
+			/* Calculate values to be used if larger than junction point */
+			__m128 mul_over = _mm_load_ps(_srb_mul_over);
+			__m128 sub_over = _mm_load_ps(_srb_sub_over);
+			__m128 pow_over = _mm_load_ps(_srb_pow_over);
+			__m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over, _mm_fastpow_ps(r, pow_over)), sub_over);
+			__m128 g_gam = _mm_sub_ps(_mm_mul_ps( mul_over, _mm_fastpow_ps(g, pow_over)), sub_over);
+			__m128 b_gam = _mm_sub_ps(_mm_mul_ps( mul_over, _mm_fastpow_ps(b, pow_over)), sub_over);
+
+			/* Create mask for values smaller than junction point */
+			__m128 junction = _mm_load_ps(_junction_ps);
+			__m128 mask_r = _mm_cmplt_ps(r, junction);
+			__m128 mask_g = _mm_cmplt_ps(g, junction);
+			__m128 mask_b = _mm_cmplt_ps(b, junction);
+
+			/* Calculate value to be used if under junction */
+			__m128 mul_under = _mm_load_ps(_srb_mul_under);
+			__m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under, r));
+			__m128 g_mul = _mm_and_ps(mask_g, _mm_mul_ps(mul_under, g));
+			__m128 b_mul = _mm_and_ps(mask_b, _mm_mul_ps(mul_under, b));
+
+			/* Select the value to be used based on the junction mask and scale to 8 bit */
+			__m128 upscale = _mm_load_ps(_8bit);
+			r = _mm_mul_ps(upscale, _mm_or_ps(r_mul, _mm_andnot_ps(mask_r, r_gam)));
+			g = _mm_mul_ps(upscale, _mm_or_ps(g_mul, _mm_andnot_ps(mask_g, g_gam)));
+			b = _mm_mul_ps(upscale, _mm_or_ps(b_mul, _mm_andnot_ps(mask_b, b_gam)));
+			
+			/* Convert to 8 bit unsigned  and interleave*/
+			__m128i r_i = _mm_cvtps_epi32(r);
+			__m128i g_i = _mm_cvtps_epi32(g);
+			__m128i b_i = _mm_cvtps_epi32(b);
+			
+			r_i = _mm_packs_epi32(r_i, r_i);
+			g_i = _mm_packs_epi32(g_i, g_i);
+			b_i = _mm_packs_epi32(b_i, b_i);
+
+			/* Set alpha value to 255 and store */
+			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
+			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+			p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+			p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+	
+			p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));
+
+			if (aligned_write)
+				_mm_store_si128((__m128i*)o, p1);
+			else
+				_mm_storeu_si128((__m128i*)o, p1);
+
+			i += 16;
+			o += 16;
+		}
+		/* Process remaining pixels */
+		width = complete_w & 3;
+		while(width--)
+		{
+			__m128i zero = _mm_setzero_si128();
+			__m128i in = _mm_loadl_epi64((__m128i*)i); // Load two pixels
+			__m128i p1 =_mm_unpacklo_epi16(in, zero);
+			__m128 p1f  = _mm_cvtepi32_ps(p1);
+
+			/* Splat r,g,b */
+			__m128 r =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(4,4,4,4));
+			__m128 g =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(3,3,3,3));
+			__m128 b =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2));
+
+			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+			r = _mm_unpacklo_ps(r2, g2);	// GG RR GG RR
+			r = _mm_movelh_ps(r, b2);		// BB BB GG RR
+
+			__m128 normalize = _mm_load_ps(_normalize);
+			__m128 max_val = _mm_load_ps(_ones_ps);
+			__m128 min_val = _mm_setzero_ps();
+			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
+			__m128 mul_over = _mm_load_ps(_srb_mul_over);
+			__m128 sub_over = _mm_load_ps(_srb_sub_over);
+			__m128 pow_over = _mm_load_ps(_srb_pow_over);
+			__m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over, _mm_fastpow_ps(r, pow_over)), sub_over);
+			__m128 junction = _mm_load_ps(_junction_ps);
+			__m128 mask_r = _mm_cmplt_ps(r, junction);
+			__m128 mul_under = _mm_load_ps(_srb_mul_under);
+			__m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under, r));
+			__m128 upscale = _mm_load_ps(_8bit);
+			r = _mm_mul_ps(upscale, _mm_or_ps(r_mul, _mm_andnot_ps(mask_r, r_gam)));
+			
+			/* Convert to 8 bit unsigned */
+			zero = _mm_setzero_si128();
+			__m128i r_i = _mm_cvtps_epi32(r);
+			/* To 16 bit signed */
+			r_i = _mm_packs_epi32(r_i, zero);
+			/* To 8 bit unsigned - set alpha channel*/
+			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
+			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
+			*(int*)o = _mm_cvtsi128_si32(r_i);
+			i+=4;
+			o+=4;
+		}
+	}
+}
+
+
+void
+transform8_otherrgb_sse2(ThreadInfo* t)
+{
+	RS_IMAGE16 *input = t->input;
+	GdkPixbuf *output = t->output;
+	RS_MATRIX3 *matrix = t->matrix;
+	gint x,y;
+	gint width;
+
+	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
+	for (x = 0; x < 4; x++ ) {
+		mat_ps[x] = matrix->coeff[0][0];
+		mat_ps[x+4] = matrix->coeff[0][1];
+		mat_ps[x+8] = matrix->coeff[0][2];
+		mat_ps[12+x] = matrix->coeff[1][0];
+		mat_ps[12+x+4] = matrix->coeff[1][1];
+		mat_ps[12+x+8] = matrix->coeff[1][2];
+		mat_ps[24+x] = matrix->coeff[2][0];
+		mat_ps[24+x+4] = matrix->coeff[2][1];
+		mat_ps[24+x+8] = matrix->coeff[2][2];
+	}
+	
+	int start_x = t->start_x;
+	/* Always have aligned input and output adress */
+	if (start_x & 3)
+		start_x = ((start_x) / 4) * 4;
+	
+	int complete_w = t->end_x - start_x;
+	/* If width is not multiple of 4, check if we can extend it a bit */
+	if (complete_w & 3)
+	{
+		if ((t->end_x+4) < input->w)
+			complete_w = ((complete_w+3) / 4 *4);
+	}
+	__m128 gamma = _mm_set1_ps(t->output_gamma);
+
+	for(y=t->start_y ; y<t->end_y ; y++)
+	{
+		gushort *i = GET_PIXEL(input, start_x, y);
+		guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
+		gboolean aligned_write = !((guintptr)(o)&0xf);
+
+		width = complete_w >> 2;
+
+		while(width--)
+		{
+			/* Load and convert to float */
+			__m128i zero = _mm_setzero_si128();
+			__m128i in = _mm_load_si128((__m128i*)i); // Load two pixels
+			__m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels
+			_mm_prefetch(i + 64, _MM_HINT_NTA);
+			__m128i p1 =_mm_unpacklo_epi16(in, zero);
+			__m128i p2 =_mm_unpackhi_epi16(in, zero);
+			__m128i p3 =_mm_unpacklo_epi16(in2, zero);
+			__m128i p4 =_mm_unpackhi_epi16(in2, zero);
+			__m128 p1f  = _mm_cvtepi32_ps(p1);
+			__m128 p2f  = _mm_cvtepi32_ps(p2);
+			__m128 p3f  = _mm_cvtepi32_ps(p3);
+			__m128 p4f  = _mm_cvtepi32_ps(p4);
+			
+			/* Convert to planar */
+			__m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
+			__m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
+			__m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
+			__m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
+			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
+			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
+			__m128 b = _mm_movelh_ps(b1b0, b3b2);
+
+			/* Apply matrix to convert to sRGB */
+			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+			/* Normalize to 0->1 and clamp */
+			__m128 normalize = _mm_load_ps(_normalize);
+			__m128 max_val = _mm_load_ps(_ones_ps);
+			__m128 min_val = _mm_setzero_ps();
+			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
+			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
+			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));
+
+			/* Apply Gamma */
+			__m128 upscale = _mm_load_ps(_8bit);
+			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
+			g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
+			b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));
+
+			/* Convert to 8 bit unsigned  and interleave*/
+			__m128i r_i = _mm_cvtps_epi32(r);
+			__m128i g_i = _mm_cvtps_epi32(g);
+			__m128i b_i = _mm_cvtps_epi32(b);
+			
+			r_i = _mm_packs_epi32(r_i, r_i);
+			g_i = _mm_packs_epi32(g_i, g_i);
+			b_i = _mm_packs_epi32(b_i, b_i);
+
+			/* Set alpha value to 255 and store */
+			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
+			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
+			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
+			p1 = _mm_unpacklo_epi32(rg_i, bb_i);
+			p2 = _mm_unpackhi_epi32(rg_i, bb_i);
+	
+			p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));
+
+			if (aligned_write)
+				_mm_store_si128((__m128i*)o, p1);
+			else
+				_mm_storeu_si128((__m128i*)o, p1);
+
+			i += 16;
+			o += 16;
+		}
+		/* Process remaining pixels */
+		width = complete_w & 3;
+		while(width--)
+		{
+			__m128i zero = _mm_setzero_si128();
+			__m128i in = _mm_loadl_epi64((__m128i*)i); // Load two pixels
+			__m128i p1 =_mm_unpacklo_epi16(in, zero);
+			__m128 p1f  = _mm_cvtepi32_ps(p1);
+
+			/* Splat r,g,b */
+			__m128 r =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(4,4,4,4));
+			__m128 g =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(3,3,3,3));
+			__m128 b =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2));
+
+			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
+			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
+			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);
+
+			r = _mm_unpacklo_ps(r2, g2);	// GG RR GG RR
+			r = _mm_movelh_ps(r, b2);		// BB BB GG RR
+
+			__m128 normalize = _mm_load_ps(_normalize);
+			__m128 max_val = _mm_load_ps(_ones_ps);
+			__m128 min_val = _mm_setzero_ps();
+			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
+			__m128 upscale = _mm_load_ps(_8bit);
+			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
+			
+			/* Convert to 8 bit unsigned */
+			zero = _mm_setzero_si128();
+			__m128i r_i = _mm_cvtps_epi32(r);
+			/* To 16 bit signed */
+			r_i = _mm_packs_epi32(r_i, zero);
+			/* To 8 bit unsigned - set alpha channel*/
+			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
+			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
+			*(int*)o = _mm_cvtsi128_si32(r_i);
+			i+=4;
+			o+=4;
+		}
+	}
+}
+
+gboolean cst_has_sse2() 
+{
+	return TRUE;
+}
+
+#else // !defined __SSE2__
+
+/* Provide empty functions if not SSE2 compiled to avoid linker errors */
+
+void transform8_sse2(ThreadInfo* t)
+{
+	/* We should never even get here */
+	g_assert(FALSE);
+}
+
+void
+transform8_otherrgb_sse2(ThreadInfo* t)
+{
+	/* We should never even get here */
+	g_assert(FALSE);
+}
+
+gboolean cst_has_sse2() 
+{
+	return FALSE;
+}
+
+#endif
\ No newline at end of file




More information about the Rawstudio-commit mailing list