You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1070 lines
41 KiB

  1. ///////////////////////////////////////////////////////////////////////////////////
  2. /// OpenGL Mathematics (glm.g-truc.net)
  3. ///
  4. /// Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net)
  5. /// Permission is hereby granted, free of charge, to any person obtaining a copy
  6. /// of this software and associated documentation files (the "Software"), to deal
  7. /// in the Software without restriction, including without limitation the rights
  8. /// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. /// copies of the Software, and to permit persons to whom the Software is
  10. /// furnished to do so, subject to the following conditions:
  11. ///
  12. /// The above copyright notice and this permission notice shall be included in
  13. /// all copies or substantial portions of the Software.
  14. ///
  15. /// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. /// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. /// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. /// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. /// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. /// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  21. /// THE SOFTWARE.
  22. ///
  23. /// @ref core
  24. /// @file glm/core/intrinsic_common.inl
  25. /// @date 2009-06-05 / 2011-06-15
  26. /// @author Christophe Riccio
  27. ///////////////////////////////////////////////////////////////////////////////////
  28. namespace glm{
  29. namespace detail{
// Degrees-to-radians factor (pi / 180) broadcast across all four lanes.
static const __m128 GLM_VAR_USED _m128_rad_ps = _mm_set_ps1(3.141592653589793238462643383279f / 180.f);
// Radians-to-degrees factor (180 / pi) broadcast across all four lanes.
static const __m128 GLM_VAR_USED _m128_deg_ps = _mm_set_ps1(180.f / 3.141592653589793238462643383279f);
  32. template <typename matType>
  33. GLM_FUNC_QUALIFIER matType sse_comp_mul_ps
  34. (
  35. __m128 const in1[4],
  36. __m128 const in2[4],
  37. __m128 out[4]
  38. )
  39. {
  40. out[0] = _mm_mul_ps(in1[0], in2[0]);
  41. out[1] = _mm_mul_ps(in1[1], in2[1]);
  42. out[2] = _mm_mul_ps(in1[2], in2[2]);
  43. out[3] = _mm_mul_ps(in1[3], in2[3]);
  44. }
  45. GLM_FUNC_QUALIFIER void sse_add_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
  46. {
  47. {
  48. out[0] = _mm_add_ps(in1[0], in2[0]);
  49. out[1] = _mm_add_ps(in1[1], in2[1]);
  50. out[2] = _mm_add_ps(in1[2], in2[2]);
  51. out[3] = _mm_add_ps(in1[3], in2[3]);
  52. }
  53. }
  54. GLM_FUNC_QUALIFIER void sse_sub_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
  55. {
  56. {
  57. out[0] = _mm_sub_ps(in1[0], in2[0]);
  58. out[1] = _mm_sub_ps(in1[1], in2[1]);
  59. out[2] = _mm_sub_ps(in1[2], in2[2]);
  60. out[3] = _mm_sub_ps(in1[3], in2[3]);
  61. }
  62. }
  63. GLM_FUNC_QUALIFIER __m128 sse_mul_ps(__m128 const m[4], __m128 v)
  64. {
  65. __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
  66. __m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
  67. __m128 v2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
  68. __m128 v3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
  69. __m128 m0 = _mm_mul_ps(m[0], v0);
  70. __m128 m1 = _mm_mul_ps(m[1], v1);
  71. __m128 m2 = _mm_mul_ps(m[2], v2);
  72. __m128 m3 = _mm_mul_ps(m[3], v3);
  73. __m128 a0 = _mm_add_ps(m0, m1);
  74. __m128 a1 = _mm_add_ps(m2, m3);
  75. __m128 a2 = _mm_add_ps(a0, a1);
  76. return a2;
  77. }
  78. GLM_FUNC_QUALIFIER __m128 sse_mul_ps(__m128 v, __m128 const m[4])
  79. {
  80. __m128 i0 = m[0];
  81. __m128 i1 = m[1];
  82. __m128 i2 = m[2];
  83. __m128 i3 = m[3];
  84. __m128 m0 = _mm_mul_ps(v, i0);
  85. __m128 m1 = _mm_mul_ps(v, i1);
  86. __m128 m2 = _mm_mul_ps(v, i2);
  87. __m128 m3 = _mm_mul_ps(v, i3);
  88. __m128 u0 = _mm_unpacklo_ps(m0, m1);
  89. __m128 u1 = _mm_unpackhi_ps(m0, m1);
  90. __m128 a0 = _mm_add_ps(u0, u1);
  91. __m128 u2 = _mm_unpacklo_ps(m2, m3);
  92. __m128 u3 = _mm_unpackhi_ps(m2, m3);
  93. __m128 a1 = _mm_add_ps(u2, u3);
  94. __m128 f0 = _mm_movelh_ps(a0, a1);
  95. __m128 f1 = _mm_movehl_ps(a1, a0);
  96. __m128 f2 = _mm_add_ps(f0, f1);
  97. return f2;
  98. }
  99. GLM_FUNC_QUALIFIER void sse_mul_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
  100. {
  101. {
  102. __m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0));
  103. __m128 e1 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(1, 1, 1, 1));
  104. __m128 e2 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(2, 2, 2, 2));
  105. __m128 e3 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(3, 3, 3, 3));
  106. __m128 m0 = _mm_mul_ps(in1[0], e0);
  107. __m128 m1 = _mm_mul_ps(in1[1], e1);
  108. __m128 m2 = _mm_mul_ps(in1[2], e2);
  109. __m128 m3 = _mm_mul_ps(in1[3], e3);
  110. __m128 a0 = _mm_add_ps(m0, m1);
  111. __m128 a1 = _mm_add_ps(m2, m3);
  112. __m128 a2 = _mm_add_ps(a0, a1);
  113. out[0] = a2;
  114. }
  115. {
  116. __m128 e0 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(0, 0, 0, 0));
  117. __m128 e1 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(1, 1, 1, 1));
  118. __m128 e2 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(2, 2, 2, 2));
  119. __m128 e3 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(3, 3, 3, 3));
  120. __m128 m0 = _mm_mul_ps(in1[0], e0);
  121. __m128 m1 = _mm_mul_ps(in1[1], e1);
  122. __m128 m2 = _mm_mul_ps(in1[2], e2);
  123. __m128 m3 = _mm_mul_ps(in1[3], e3);
  124. __m128 a0 = _mm_add_ps(m0, m1);
  125. __m128 a1 = _mm_add_ps(m2, m3);
  126. __m128 a2 = _mm_add_ps(a0, a1);
  127. out[1] = a2;
  128. }
  129. {
  130. __m128 e0 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(0, 0, 0, 0));
  131. __m128 e1 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(1, 1, 1, 1));
  132. __m128 e2 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(2, 2, 2, 2));
  133. __m128 e3 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(3, 3, 3, 3));
  134. __m128 m0 = _mm_mul_ps(in1[0], e0);
  135. __m128 m1 = _mm_mul_ps(in1[1], e1);
  136. __m128 m2 = _mm_mul_ps(in1[2], e2);
  137. __m128 m3 = _mm_mul_ps(in1[3], e3);
  138. __m128 a0 = _mm_add_ps(m0, m1);
  139. __m128 a1 = _mm_add_ps(m2, m3);
  140. __m128 a2 = _mm_add_ps(a0, a1);
  141. out[2] = a2;
  142. }
  143. {
  144. //(__m128&)_mm_shuffle_epi32(__m128i&)in2[0], _MM_SHUFFLE(3, 3, 3, 3))
  145. __m128 e0 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(0, 0, 0, 0));
  146. __m128 e1 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(1, 1, 1, 1));
  147. __m128 e2 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(2, 2, 2, 2));
  148. __m128 e3 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(3, 3, 3, 3));
  149. __m128 m0 = _mm_mul_ps(in1[0], e0);
  150. __m128 m1 = _mm_mul_ps(in1[1], e1);
  151. __m128 m2 = _mm_mul_ps(in1[2], e2);
  152. __m128 m3 = _mm_mul_ps(in1[3], e3);
  153. __m128 a0 = _mm_add_ps(m0, m1);
  154. __m128 a1 = _mm_add_ps(m2, m3);
  155. __m128 a2 = _mm_add_ps(a0, a1);
  156. out[3] = a2;
  157. }
  158. }
  159. GLM_FUNC_QUALIFIER void sse_transpose_ps(__m128 const in[4], __m128 out[4])
  160. {
  161. __m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44);
  162. __m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE);
  163. __m128 tmp1 = _mm_shuffle_ps(in[2], in[3], 0x44);
  164. __m128 tmp3 = _mm_shuffle_ps(in[2], in[3], 0xEE);
  165. out[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
  166. out[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
  167. out[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
  168. out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
  169. }
/// Computes the determinant of a column-major 4x4 matrix.
/// "Slow" path: builds the six 2x2 sub-factor vectors and the first row of
/// the sign-corrected cofactor matrix exactly as sse_inverse_ps does, then
/// dots that row with in[0]. Returns the result of sse_dot_ps (declared
/// elsewhere in this file — presumably the dot product broadcast to all
/// lanes; confirm against the full file).
GLM_FUNC_QUALIFIER __m128 sse_slow_det_ps(__m128 const in[4])
{
	// Fac0..Fac5 are vectors of 2x2 sub-determinants; the commented scalar
	// expressions below each block are the values being packed.
	__m128 Fac0;
	{
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac1;
	{
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac2;
	{
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac3;
	{
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac4;
	{
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac5;
	{
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}
	// Alternating cofactor signs (note _mm_set_ps lists lanes high-to-low).
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);
	// Vec0..Vec3 pack one element from columns 0 and 1 of the matrix:
	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));
	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);
	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);
	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);
	// col3
	// - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);
	// Gather element 0 of each cofactor column into Row2 (first row of the
	// would-be inverse, before division by the determinant).
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));
	// valType Determinant = m[0][0] * Inverse[0][0]
	// + m[0][1] * Inverse[1][0]
	// + m[0][2] * Inverse[2][0]
	// + m[0][3] * Inverse[3][0];
	__m128 Det0 = sse_dot_ps(in[0], Row2);
	return Det0;
}
/// Computes the determinant of a column-major 4x4 matrix by Laplace
/// expansion along column 0: six 2x2 sub-factors from columns 2 and 3,
/// a cofactor vector weighted by column 1, then a dot with m[0].
/// Same algorithm as sse_det_ps, but the lane permutations go through the
/// integer domain (_mm_castps_si128 / _mm_shuffle_epi32 / _mm_castsi128_ps)
/// instead of _mm_shuffle_ps. Despite the "d" in the name this is a
/// single-precision routine — presumably the suffix marks this shuffle
/// variant; confirm against the full file.
GLM_FUNC_QUALIFIER __m128 sse_detd_ps
(
	__m128 const m[4]
)
{
	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(
	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
	// First 2 columns
	__m128 Swp2A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 1, 1, 2)));
	__m128 Swp3A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(3, 2, 3, 3)));
	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);
	// Second 2 columns
	__m128 Swp2B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(3, 2, 3, 3)));
	__m128 Swp3B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(0, 1, 1, 2)));
	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);
	// Columns subtraction: SubE packs SubFactor00..03.
	__m128 SubE = _mm_sub_ps(MulA, MulB);
	// Last 2 rows: SubF supplies the remaining sub-factors (04, 05).
	__m128 Swp2C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 0, 1, 2)));
	__m128 Swp3C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(1, 2, 0, 0)));
	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);
	//detail::tvec4<T> DetCof(
	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));
	__m128 SubFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubE), _MM_SHUFFLE(2, 1, 0, 0)));
	__m128 SwpFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(0, 0, 0, 1)));
	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);
	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
	__m128 SubFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpB), _MM_SHUFFLE(3, 1, 1, 0)));//SubF[0], SubE[3], SubE[3], SubE[1];
	__m128 SwpFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(1, 1, 2, 2)));
	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);
	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);
	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
	__m128 SubFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpC), _MM_SHUFFLE(3, 3, 2, 0)));
	__m128 SwpFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(2, 3, 3, 3)));
	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);
	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
	// Apply the alternating cofactor signs (+, -, +, - in lane order).
	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));
	//return m[0][0] * DetCof[0]
	//	+ m[0][1] * DetCof[1]
	//	+ m[0][2] * DetCof[2]
	//	+ m[0][3] * DetCof[3];
	return sse_dot_ps(m[0], DetCof);
}
/// Computes the determinant of a column-major 4x4 matrix by Laplace
/// expansion along column 0: six 2x2 sub-factors from columns 2 and 3,
/// a cofactor vector weighted by column 1, then a dot with m[0].
/// Identical algorithm to sse_detd_ps but with all lane permutations done
/// via _mm_shuffle_ps (float domain) rather than _mm_shuffle_epi32.
/// sse_dot_ps is declared elsewhere in this file.
GLM_FUNC_QUALIFIER __m128 sse_det_ps
(
	__m128 const m[4]
)
{
	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(add)
	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
	// First 2 columns
	__m128 Swp2A = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 1, 1, 2));
	__m128 Swp3A = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(3, 2, 3, 3));
	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);
	// Second 2 columns
	__m128 Swp2B = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(3, 2, 3, 3));
	__m128 Swp3B = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(0, 1, 1, 2));
	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);
	// Columns subtraction: SubE packs SubFactor00..03.
	__m128 SubE = _mm_sub_ps(MulA, MulB);
	// Last 2 rows: SubF supplies the remaining sub-factors (04, 05).
	__m128 Swp2C = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 0, 1, 2));
	__m128 Swp3C = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(1, 2, 0, 0));
	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);
	//detail::tvec4<T> DetCof(
	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));
	__m128 SubFacA = _mm_shuffle_ps(SubE, SubE, _MM_SHUFFLE(2, 1, 0, 0));
	__m128 SwpFacA = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(0, 0, 0, 1));
	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);
	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
	__m128 SubFacB = _mm_shuffle_ps(SubTmpB, SubTmpB, _MM_SHUFFLE(3, 1, 1, 0));//SubF[0], SubE[3], SubE[3], SubE[1];
	__m128 SwpFacB = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(1, 1, 2, 2));
	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);
	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);
	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
	__m128 SubFacC = _mm_shuffle_ps(SubTmpC, SubTmpC, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 SwpFacC = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(2, 3, 3, 3));
	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);
	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
	// Apply the alternating cofactor signs (+, -, +, - in lane order).
	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));
	//return m[0][0] * DetCof[0]
	//	+ m[0][1] * DetCof[1]
	//	+ m[0][2] * DetCof[2]
	//	+ m[0][3] * DetCof[3];
	return sse_dot_ps(m[0], DetCof);
}
/// Computes the inverse of a column-major 4x4 matrix: out = in^-1.
/// Classic adjugate-over-determinant: six packed 2x2 sub-factor vectors
/// (Fac0..Fac5), four sign-corrected cofactor columns (Inv0..Inv3), a
/// determinant via dot product with in[0], then a true division by the
/// determinant (a commented-out _mm_rcp_ps alternative would trade accuracy
/// for speed).
/// NOTE(review): a singular input (determinant == 0) divides by zero and
/// produces inf/nan columns — callers must ensure invertibility.
/// NOTE(review): `one` and `sse_dot_ps` are defined elsewhere in this file;
/// `one` is presumably _mm_set_ps1(1.0f) — confirm against the full file.
GLM_FUNC_QUALIFIER void sse_inverse_ps(__m128 const in[4], __m128 out[4])
{
	// Fac0..Fac5 are vectors of 2x2 sub-determinants; the commented scalar
	// expressions below each block are the values being packed.
	__m128 Fac0;
	{
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac1;
	{
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac2;
	{
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac3;
	{
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac4;
	{
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac5;
	{
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}
	// Alternating cofactor signs (note _mm_set_ps lists lanes high-to-low).
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);
	// Vec0..Vec3 pack one element from columns 0 and 1 of the matrix:
	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));
	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);
	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);
	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);
	// col3
	// - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);
	// Gather element 0 of each cofactor column into Row2 (first row of the
	// adjugate) so the determinant can be formed by one dot product.
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));
	// valType Determinant = m[0][0] * Inverse[0][0]
	// + m[0][1] * Inverse[1][0]
	// + m[0][2] * Inverse[2][0]
	// + m[0][3] * Inverse[3][0];
	__m128 Det0 = sse_dot_ps(in[0], Row2);
	// Exact reciprocal of the determinant; _mm_rcp_ps would be faster but
	// only ~12-bit accurate, hence it is left commented out.
	__m128 Rcp0 = _mm_div_ps(one, Det0);
	//__m128 Rcp0 = _mm_rcp_ps(Det0);
	// Inverse /= Determinant;
	out[0] = _mm_mul_ps(Inv0, Rcp0);
	out[1] = _mm_mul_ps(Inv1, Rcp0);
	out[2] = _mm_mul_ps(Inv2, Rcp0);
	out[3] = _mm_mul_ps(Inv3, Rcp0);
}
GLM_FUNC_QUALIFIER void sse_inverse_fast_ps(__m128 const in[4], __m128 out[4])
{
	// Inverts the 4x4 column-major matrix `in` into `out` using the
	// adjugate (cofactor) method, fully vectorized with SSE shuffles.
	// "fast" variant: the final division by the determinant uses the
	// approximate _mm_rcp_ps reciprocal (~12 bits of precision) instead of
	// a full-precision _mm_div_ps — see sse_inverse_ps for the exact form.
	// NOTE(review): no singularity check — a zero determinant yields Inf/NaN.
	//
	// Fac0..Fac5 pack the six distinct 2x2 sub-determinants (SubFactors of
	// the cofactor expansion), each replicated in the lane layout the four
	// inverse columns need.
	__m128 Fac0;
	{
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac1;
	{
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac2;
	{
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac3;
	{
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac4;
	{
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}
	__m128 Fac5;
	{
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}
	// Alternating cofactor signs (checkerboard pattern of the adjugate).
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);
	// Vec0..Vec3 broadcast element k of rows 0/1 as (m[1][k], m[0][k], m[0][k], m[0][k]).
	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));
	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));
	// col0 of the adjugate:
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);
	// col1:
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[1] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[2] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[3] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);
	// col2:
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[1] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[2] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[3] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);
	// col3:
	// - (Vec0[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec0[1] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec0[2] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec0[3] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);
	// Gather element 0 of each adjugate column into Row2 = first row of the adjugate.
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));
	// valType Determinant = m[0][0] * Inverse[0][0]
	// + m[0][1] * Inverse[1][0]
	// + m[0][2] * Inverse[2][0]
	// + m[0][3] * Inverse[3][0];
	__m128 Det0 = sse_dot_ps(in[0], Row2);
	// Approximate 1/det — the "fast" trade-off of this function.
	__m128 Rcp0 = _mm_rcp_ps(Det0);
	//__m128 Rcp0 = _mm_div_ps(one, Det0);
	// Inverse /= Determinant;
	out[0] = _mm_mul_ps(Inv0, Rcp0);
	out[1] = _mm_mul_ps(Inv1, Rcp0);
	out[2] = _mm_mul_ps(Inv2, Rcp0);
	out[3] = _mm_mul_ps(Inv3, Rcp0);
}
  820. /*
  821. GLM_FUNC_QUALIFIER void sse_rotate_ps(__m128 const in[4], float Angle, float const v[3], __m128 out[4])
  822. {
  823. float a = glm::radians(Angle);
  824. float c = cos(a);
  825. float s = sin(a);
  826. glm::vec4 AxisA(v[0], v[1], v[2], float(0));
  827. __m128 AxisB = _mm_set_ps(AxisA.w, AxisA.z, AxisA.y, AxisA.x);
  828. __m128 AxisC = detail::sse_nrm_ps(AxisB);
  829. __m128 Cos0 = _mm_set_ss(c);
  830. __m128 CosA = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(0, 0, 0, 0));
  831. __m128 Sin0 = _mm_set_ss(s);
  832. __m128 SinA = _mm_shuffle_ps(Sin0, Sin0, _MM_SHUFFLE(0, 0, 0, 0));
  833. // detail::tvec3<valType> temp = (valType(1) - c) * axis;
  834. __m128 Temp0 = _mm_sub_ps(one, CosA);
  835. __m128 Temp1 = _mm_mul_ps(Temp0, AxisC);
  836. //Rotate[0][0] = c + temp[0] * axis[0];
  837. //Rotate[0][1] = 0 + temp[0] * axis[1] + s * axis[2];
  838. //Rotate[0][2] = 0 + temp[0] * axis[2] - s * axis[1];
  839. __m128 Axis0 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(0, 0, 0, 0));
  840. __m128 TmpA0 = _mm_mul_ps(Axis0, AxisC);
  841. __m128 CosA0 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 1, 1, 0));
  842. __m128 TmpA1 = _mm_add_ps(CosA0, TmpA0);
  843. __m128 SinA0 = SinA;//_mm_set_ps(0.0f, s, -s, 0.0f);
  844. __m128 TmpA2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 1, 2, 3));
  845. __m128 TmpA3 = _mm_mul_ps(SinA0, TmpA2);
  846. __m128 TmpA4 = _mm_add_ps(TmpA1, TmpA3);
  847. //Rotate[1][0] = 0 + temp[1] * axis[0] - s * axis[2];
  848. //Rotate[1][1] = c + temp[1] * axis[1];
  849. //Rotate[1][2] = 0 + temp[1] * axis[2] + s * axis[0];
  850. __m128 Axis1 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(1, 1, 1, 1));
  851. __m128 TmpB0 = _mm_mul_ps(Axis1, AxisC);
  852. __m128 CosA1 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 1, 0, 1));
  853. __m128 TmpB1 = _mm_add_ps(CosA1, TmpB0);
  854. __m128 SinB0 = SinA;//_mm_set_ps(-s, 0.0f, s, 0.0f);
  855. __m128 TmpB2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 0, 3, 2));
  856. __m128 TmpB3 = _mm_mul_ps(SinA0, TmpB2);
  857. __m128 TmpB4 = _mm_add_ps(TmpB1, TmpB3);
  858. //Rotate[2][0] = 0 + temp[2] * axis[0] + s * axis[1];
  859. //Rotate[2][1] = 0 + temp[2] * axis[1] - s * axis[0];
  860. //Rotate[2][2] = c + temp[2] * axis[2];
  861. __m128 Axis2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(2, 2, 2, 2));
  862. __m128 TmpC0 = _mm_mul_ps(Axis2, AxisC);
  863. __m128 CosA2 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 0, 1, 1));
  864. __m128 TmpC1 = _mm_add_ps(CosA2, TmpC0);
  865. __m128 SinC0 = SinA;//_mm_set_ps(s, -s, 0.0f, 0.0f);
  866. __m128 TmpC2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 3, 0, 1));
  867. __m128 TmpC3 = _mm_mul_ps(SinA0, TmpC2);
  868. __m128 TmpC4 = _mm_add_ps(TmpC1, TmpC3);
  869. __m128 Result[4];
  870. Result[0] = TmpA4;
  871. Result[1] = TmpB4;
  872. Result[2] = TmpC4;
  873. Result[3] = _mm_set_ps(1, 0, 0, 0);
  874. //detail::tmat4x4<valType> Result(detail::tmat4x4<valType>::null);
  875. //Result[0] = m[0] * Rotate[0][0] + m[1] * Rotate[0][1] + m[2] * Rotate[0][2];
  876. //Result[1] = m[0] * Rotate[1][0] + m[1] * Rotate[1][1] + m[2] * Rotate[1][2];
  877. //Result[2] = m[0] * Rotate[2][0] + m[1] * Rotate[2][1] + m[2] * Rotate[2][2];
  878. //Result[3] = m[3];
  879. //return Result;
  880. sse_mul_ps(in, Result, out);
  881. }
  882. */
  883. GLM_FUNC_QUALIFIER void sse_outer_ps(__m128 const & c, __m128 const & r, __m128 out[4])
  884. {
  885. out[0] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0)));
  886. out[1] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1)));
  887. out[2] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2)));
  888. out[3] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3)));
  889. }
  890. }//namespace detail
  891. }//namespace glm