xref: /petsc/include/petsc/private/kernels/blockinvert.h (revision a02648fdf9ec0d41d7b5ca02cb70ddcfa0e65728)
1 /*
2     Kernels used in sparse ILU (and LU) and in the resulting triangular
3  solves. These are for block algorithms where the block sizes are on
4  the order of 2-6+.
5 
6     There are TWO versions of the macros below.
7     1) standard for MatScalar == PetscScalar use the standard BLAS for
8        block size larger than 7 and
9     2) handcoded Fortran single precision for the matrices, since BLAS
10        does not have some arguments in single and some in double.
11 
12 */
13 #pragma once
14 #include <petscblaslapack.h>
15 
16 /*
17       These are C kernels,they are contained in
18    src/mat/impls/baij/seq
19 */
20 
21 PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode PetscLINPACKgefa(MatScalar *, PetscInt, PetscInt *, PetscBool, PetscBool *);
22 PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode PetscLINPACKgedi(MatScalar *, PetscInt, PetscInt *, MatScalar *);
23 
24 PETSC_EXTERN PetscErrorCode                PetscKernel_A_gets_inverse_A_2(MatScalar *, PetscReal, PetscBool, PetscBool *);
25 PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar *, PetscReal, PetscBool, PetscBool *);
26 
27 #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) \
28   PetscMacroReturnStandard(MatScalar d, di = mat[0]; mat[0] = d = 1.0 / di; mat[4] *= -d; mat[8] *= -d; mat[12] *= -d; mat[1] *= d; mat[2] *= d; mat[3] *= d; mat[5] += mat[4] * mat[1] * di; mat[6] += mat[4] * mat[2] * di; mat[7] += mat[4] * mat[3] * di; mat[9] += mat[8] * mat[1] * di; mat[10] += mat[8] * mat[2] * di; mat[11] += mat[8] * mat[3] * di; mat[13] += mat[12] * mat[1] * di; mat[14] += mat[12] * mat[2] * di; mat[15] += mat[12] * mat[3] * di; di = mat[5]; mat[5] = d = 1.0 / di; mat[1] *= -d; mat[9] *= -d; mat[13] *= -d; mat[4] *= d; mat[6] *= d; mat[7] *= d; mat[0] += mat[1] * mat[4] * di; mat[2] += mat[1] * mat[6] * di; mat[3] += mat[1] * mat[7] * di; mat[8] += mat[9] * mat[4] * di; mat[10] += mat[9] * mat[6] * di; mat[11] += mat[9] * mat[7] * di; mat[12] += mat[13] * mat[4] * di; mat[14] += mat[13] * mat[6] * di; mat[15] += mat[13] * mat[7] * di; di = mat[10]; mat[10] = d = 1.0 / di; mat[2] *= -d; mat[6] *= -d; mat[14] *= -d; mat[8] *= d; mat[9] *= d; mat[11] *= d; mat[0] += mat[2] * mat[8] * di; mat[1] += mat[2] * mat[9] * di; mat[3] += mat[2] * mat[11] * di; mat[4] += mat[6] * mat[8] * di; mat[5] += mat[6] * mat[9] * di; mat[7] += mat[6] * mat[11] * di; mat[12] += mat[14] * mat[8] * di; mat[13] += mat[14] * mat[9] * di; mat[15] += mat[14] * mat[11] * di; di = mat[15]; mat[15] = d = 1.0 / di; mat[3] *= -d; mat[7] *= -d; mat[11] *= -d; mat[12] *= d; mat[13] *= d; mat[14] *= d; mat[0] += mat[3] * mat[12] * di; mat[1] += mat[3] * mat[13] * di; mat[2] += mat[3] * mat[14] * di; mat[4] += mat[7] * mat[12] * di; mat[5] += mat[7] * mat[13] * di; mat[6] += mat[7] * mat[14] * di; mat[8] += mat[11] * mat[12] * di; mat[9] += mat[11] * mat[13] * di; mat[10] += mat[11] * mat[14] * di;)
29 
30 PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar *, PetscReal, PetscBool, PetscBool *);
31 PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar *, PetscInt *, MatScalar *, PetscReal, PetscBool, PetscBool *);
32 PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar *, PetscReal, PetscBool, PetscBool *);
33 PETSC_SINGLE_LIBRARY_INTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar *, PetscReal, PetscBool, PetscBool *);
34 PETSC_INTERN PetscErrorCode                PetscKernel_A_gets_inverse_A_9(MatScalar *, PetscReal, PetscBool, PetscBool *);
35 PETSC_INTERN PetscErrorCode                PetscKernel_A_gets_inverse_A_15(MatScalar *, PetscInt *, MatScalar *, PetscReal, PetscBool, PetscBool *);
36 
37 /*
38     A = inv(A)    A_gets_inverse_A
39 
40    A      - square bs by bs array stored in column major order
41    pivots - integer work array of length bs
42    W      -  bs by 1 work array
43 */
44 #define PetscKernel_A_gets_inverse_A(bs, A, pivots, W, allowzeropivot, zeropivotdetected) ((PetscErrorCode)(PetscLINPACKgefa((A), (bs), (pivots), (allowzeropivot), (zeropivotdetected)) || PetscLINPACKgedi((A), (bs), (pivots), (W))))
45 
46 /* -----------------------------------------------------------------------*/
47 
48 #if !defined(PETSC_USE_REAL_MAT_SINGLE)
49   /*
50         Version that calls the BLAS directly
51 */
52   /*
53       A = A * B   A_gets_A_times_B
54 
55    A, B - square bs by bs arrays stored in column major order
56    W    - square bs by bs work array
57 
58 */
59   #define PetscKernel_A_gets_A_times_B(bs, A, B, W) \
60     do { \
61       PetscBLASInt _bbs; \
62       PetscScalar  _one = 1.0, _zero = 0.0; \
63       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
64       PetscCall(PetscArraycpy((W), (A), (bs) * (bs))); \
65       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 2)); \
66       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(B, 3)); \
67       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(W, 4)); \
68       PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &(_bbs), &(_bbs), &(_bbs), &_one, (W), &(_bbs), (B), &(_bbs), &_zero, (A), &(_bbs))); \
69     } while (0)
70 
71   /*
72 
73     A = A - B * C  A_gets_A_minus_B_times_C
74 
75    A, B, C - square bs by bs arrays stored in column major order
76 */
77   #define PetscKernel_A_gets_A_minus_B_times_C(bs, A, B, C) \
78     do { \
79       PetscScalar  _mone = -1.0, _one = 1.0; \
80       PetscBLASInt _bbs; \
81       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
82       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 2)); \
83       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(B, 3)); \
84       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(C, 4)); \
85       PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &(_bbs), &(_bbs), &(_bbs), &_mone, (B), &(_bbs), (C), &(_bbs), &_one, (A), &(_bbs))); \
86     } while (0)
87 
88   /*
89 
90     A = A + B^T * C  A_gets_A_plus_Btranspose_times_C
91 
92    A, B, C - square bs by bs arrays stored in column major order
93 */
94   #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, A, B, C) \
95     do { \
96       PetscScalar  _one = 1.0; \
97       PetscBLASInt _bbs; \
98       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
99       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 2)); \
100       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(B, 3)); \
101       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(C, 4)); \
102       PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &(_bbs), &(_bbs), &(_bbs), &_one, (B), &(_bbs), (C), &(_bbs), &_one, (A), &(_bbs))); \
103     } while (0)
104 
105   /*
106     v = v + A^T w  v_gets_v_plus_Atranspose_times_w
107 
108    v - array of length bs
109    A - square bs by bs array
110    w - array of length bs
111 */
112   #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, v, A, w) \
113     do { \
114       PetscScalar  _one  = 1.0; \
115       PetscBLASInt _ione = 1, _bbs; \
116       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
117       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(v, 2)); \
118       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 3)); \
119       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(w, 4)); \
120       PetscCallBLAS("BLASgemv", BLASgemv_("T", &(_bbs), &(_bbs), &_one, A, &(_bbs), w, &_ione, &_one, v, &_ione)); \
121     } while (0)
122 
123   /*
124     v = v - A w  v_gets_v_minus_A_times_w
125 
126    v - array of length bs
127    A - square bs by bs array
128    w - array of length bs
129 */
130   #define PetscKernel_v_gets_v_minus_A_times_w(bs, v, A, w) \
131     do { \
132       PetscScalar  _mone = -1.0, _one = 1.0; \
133       PetscBLASInt _ione = 1, _bbs; \
134       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
135       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(v, 2)); \
136       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 3)); \
137       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(w, 4)); \
138       PetscCallBLAS("BLASgemv", BLASgemv_("N", &(_bbs), &(_bbs), &_mone, A, &(_bbs), w, &_ione, &_one, v, &_ione)); \
139     } while (0)
140 
141   /*
142     v = v - A w  v_gets_v_minus_transA_times_w
143 
144    v - array of length bs
145    A - square bs by bs array
146    w - array of length bs
147 */
148   #define PetscKernel_v_gets_v_minus_transA_times_w(bs, v, A, w) \
149     do { \
150       PetscScalar  _mone = -1.0, _one = 1.0; \
151       PetscBLASInt _ione = 1, _bbs; \
152       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
153       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(v, 2)); \
154       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 3)); \
155       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(w, 4)); \
156       PetscCallBLAS("BLASgemv", BLASgemv_("T", &(_bbs), &(_bbs), &_mone, A, &(_bbs), w, &_ione, &_one, v, &_ione)); \
157     } while (0)
158 
159   /*
160     v = v + A w  v_gets_v_plus_A_times_w
161 
162    v - array of length bs
163    A - square bs by bs array
164    w - array of length bs
165 */
166   #define PetscKernel_v_gets_v_plus_A_times_w(bs, v, A, w) \
167     do { \
168       PetscScalar  _one  = 1.0; \
169       PetscBLASInt _ione = 1, _bbs; \
170       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
171       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(v, 2)); \
172       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 3)); \
173       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(w, 4)); \
174       PetscCallBLAS("BLASgemv", BLASgemv_("N", &(_bbs), &(_bbs), &_one, A, &(_bbs), w, &_ione, &_one, v, &_ione)); \
175     } while (0)
176 
177   /*
178     v = v + A w  v_gets_v_plus_Ar_times_w
179 
180    v - array of length bs
181    A - square bs by bs array
182    w - array of length bs
183 */
184   #define PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, v, A, w) \
185     do { \
186       PetscScalar  _one  = 1.0; \
187       PetscBLASInt _ione = 1, _bbs, _bncols; \
188       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
189       PetscCall(PetscBLASIntCast(ncols, &_bncols)); \
190       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(v, 3)); \
191       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 4)); \
192       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(w, 5)); \
193       PetscCallBLAS("BLASgemv", BLASgemv_("N", &(_bbs), &(_bncols), &_one, A, &(_bbs), v, &_ione, &_one, w, &_ione)); \
194     } while (0)
195 
196   /*
197     w <- w - A v  w_gets_w_minus_Ar_times_v
198 
199    v - array of length ncol
200    A(bs,ncols)
201    w - array of length bs
202 */
203   #define PetscKernel_w_gets_w_minus_Ar_times_v(bs, ncols, w, A, v) \
204     do { \
205       PetscScalar  _one = 1.0, _mone = -1.0; \
206       PetscBLASInt _ione = 1, _bbs, _bncols; \
207       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
208       PetscCall(PetscBLASIntCast(ncols, &_bncols)); \
209       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(w, 3)); \
210       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 4)); \
211       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(v, 5)); \
212       PetscCallBLAS("BLASgemv", BLASgemv_("N", &(_bbs), &(_bncols), &_mone, A, &(_bbs), v, &_ione, &_one, w, &_ione)); \
213     } while (0)
214 
215   /*
216     w = A v   w_gets_A_times_v
217 
218    v - array of length bs
219    A - square bs by bs array
220    w - array of length bs
221  */
222   #define PetscKernel_w_gets_A_times_v(bs, v, A, w) \
223     do { \
224       PetscScalar  _zero = 0.0, _one = 1.0; \
225       PetscBLASInt _ione = 1, _bbs; \
226       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
227       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(v, 2)); \
228       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 3)); \
229       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(w, 4)); \
230       PetscCallBLAS("BLASgemv", BLASgemv_("N", &(_bbs), &(_bbs), &_one, A, &(_bbs), v, &_ione, &_zero, w, &_ione)); \
231     } while (0)
232 
233   /*
234     w = A' v   w_gets_transA_times_v
235 
236    v - array of length bs
237    A - square bs by bs array
238    w - array of length bs
239 */
240   #define PetscKernel_w_gets_transA_times_v(bs, v, A, w) \
241     do { \
242       PetscScalar  _zero = 0.0, _one = 1.0; \
243       PetscBLASInt _ione = 1, _bbs; \
244       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
245       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(v, 2)); \
246       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 3)); \
247       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(w, 4)); \
248       PetscCallBLAS("BLASgemv", BLASgemv_("T", &(_bbs), &(_bbs), &_one, A, &(_bbs), v, &_ione, &_zero, w, &_ione)); \
249     } while (0)
250 
251   /*
252         z = A*x
253 */
254   #define PetscKernel_w_gets_Ar_times_v(bs, ncols, x, A, z) \
255     do { \
256       PetscScalar  _one = 1.0, _zero = 0.0; \
257       PetscBLASInt _ione = 1, _bbs, _bncols; \
258       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
259       PetscCall(PetscBLASIntCast(ncols, &_bncols)); \
260       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(x, 3)); \
261       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 4)); \
262       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(z, 5)); \
263       PetscCallBLAS("BLASgemv", BLASgemv_("N", &(_bbs), &_bncols, &_one, A, &(_bbs), x, &_ione, &_zero, z, &_ione)); \
264     } while (0)
265 
266   /*
267         z = trans(A)*x
268 */
269   #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, A, z) \
270     do { \
271       PetscScalar  _one  = 1.0; \
272       PetscBLASInt _ione = 1, _bbs, _bncols; \
273       PetscCall(PetscBLASIntCast(bs, &_bbs)); \
274       PetscCall(PetscBLASIntCast(ncols, &_bncols)); \
275       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(x, 3)); \
276       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(A, 4)); \
277       PetscDisableStaticAnalyzerForExpressionUnderstandingThatThisIsDangerousAndBugprone(PetscAssertPointer(z, 5)); \
278       PetscCallBLAS("BLASgemv", BLASgemv_("T", &_bbs, &_bncols, &_one, A, &_bbs, x, &_ione, &_one, z, &_ione)); \
279     } while (0)
280 
281 #else /* !defined(PETSC_USE_REAL_MAT_SINGLE) */
282 /*
283        Version that calls Fortran routines; can handle different precision
284    of matrix (array) and vectors
285 */
286 /*
287      These are Fortran kernels: They replace certain BLAS routines but
288    have some arguments that may be single precision,rather than double
289    These routines are provided in src/fortran/kernels/sgemv.F
290    They are pretty pitiful but get the job done. The intention is
291    that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
292    code is used.
293 */
294 
295   #if defined(PETSC_HAVE_FORTRAN_CAPS)
296     #define msgemv_  MSGEMV
297     #define msgemvp_ MSGEMVP
298     #define msgemvm_ MSGEMVM
299     #define msgemvt_ MSGEMVT
300     #define msgemmi_ MSGEMMI
301     #define msgemm_  MSGEMM
302   #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
303     #define msgemv_  msgemv
304     #define msgemvp_ msgemvp
305     #define msgemvm_ msgemvm
306     #define msgemvt_ msgemvt
307     #define msgemmi_ msgemmi
308     #define msgemm_  msgemm
309   #endif
310 
311 PETSC_INTERN void msgemv_(PetscInt *, PetscInt *, MatScalar *, PetscScalar *, PetscScalar *);
312 PETSC_INTERN void msgemvp_(PetscInt *, PetscInt *, MatScalar *, PetscScalar *, PetscScalar *);
313 PETSC_INTERN void msgemvm_(PetscInt *, PetscInt *, MatScalar *, PetscScalar *, PetscScalar *);
314 PETSC_INTERN void msgemvt_(PetscInt *, PetscInt *, MatScalar *, PetscScalar *, PetscScalar *);
315 PETSC_INTERN void msgemmi_(PetscInt *, MatScalar *, MatScalar *, MatScalar *);
316 PETSC_INTERN void msgemm_(PetscInt *, MatScalar *, MatScalar *, MatScalar *);
317 
318   /*
319       A = A * B   A_gets_A_times_B
320 
321    A, B - square bs by bs arrays stored in column major order
322    W    - square bs by bs work array
323 
324 */
325   #define PetscKernel_A_gets_A_times_B(bs, A, B, W) \
326     do { \
327       PetscCall(PetscArraycpy((W), (A), (bs) * (bs))); \
328       msgemmi_(&bs, A, B, W); \
329     } while (0)
330 
331   /*
332 
333     A = A - B * C  A_gets_A_minus_B_times_C
334 
335    A, B, C - square bs by bs arrays stored in column major order
336 */
337   #define PetscKernel_A_gets_A_minus_B_times_C(bs, A, B, C)              msgemm_(&(bs), (A), (B), (C))
338 
339   /*
340     v = v - A w  v_gets_v_minus_A_times_w
341 
342    v - array of length bs
343    A - square bs by bs array
344    w - array of length bs
345 */
346   #define PetscKernel_v_gets_v_minus_A_times_w(bs, v, A, w)              msgemvm_(&(bs), &(bs), (A), (w), (v))
347 
348   /*
349     v = v + A w  v_gets_v_plus_A_times_w
350 
351    v - array of length bs
352    A - square bs by bs array
353    w - array of length bs
354 */
355   #define PetscKernel_v_gets_v_plus_A_times_w(bs, v, A, w)               msgemvp_(&(bs), &(bs), (A), (w), (v))
356 
357   /*
358     v = v + A w  v_gets_v_plus_Ar_times_w
359 
360    v - array of length bs
361    A - square bs by bs array
362    w - array of length bs
363 */
364   #define PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncol, v, A, w)        msgemvp_(&(bs), &(ncol), (A), (v), (w))
365 
366   /*
367     w = A v   w_gets_A_times_v
368 
369    v - array of length bs
370    A - square bs by bs array
371    w - array of length bs
372 */
373   #define PetscKernel_w_gets_A_times_v(bs, v, A, w)                      msgemv_(&(bs), &(bs), (A), (v), (w))
374 
375   /*
376         z = A*x
377 */
378   #define PetscKernel_w_gets_Ar_times_v(bs, ncols, x, A, z)              msgemv_(&(bs), &(ncols), (A), (x), (z))
379 
380   /*
381         z = trans(A)*x
382 */
383   #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, A, z) msgemvt_(&(bs), &(ncols), (A), (x), (z))
384 
385   /* These do not work yet */
386   #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs, A, B, C)
387   #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, v, A, w)
388 
389 #endif /* !defined(PETSC_USE_REAL_MAT_SINGLE) */
390