Skip to content

Commit bd8f99b

Browse files
committed
Improve FMA usage in laqr5
Rearrange the application of the Householder reflector to save one instruction per dot product if FMA is available. The update from the right, H * (I - tau * v * v**T), for example, changes from H - (tau * (H * v)) * v**T to H - (H * (v * tau)) * v**T. The instruction savings are due to the special structure of v, whose first component is implicitly one (and used for storing tau).
1 parent f40d220 commit bd8f99b

File tree

4 files changed

+223
-174
lines changed

4 files changed

+223
-174
lines changed

SRC/claqr5.f

Lines changed: 52 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
279279
PARAMETER ( RZERO = 0.0e0, RONE = 1.0e0 )
280280
* ..
281281
* .. Local Scalars ..
282-
COMPLEX ALPHA, BETA, CDUM, REFSUM
282+
COMPLEX ALPHA, BETA, CDUM, REFSUM, T1, T2, T3
283283
REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL,
284284
$ SMLNUM, TST1, TST2, ULP
285285
INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN,
@@ -424,12 +424,12 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
424424
* ==== Perform update from right within
425425
* . computational window. ====
426426
*
427+
T1 = V( 1, M22 )
428+
T2 = T1*CONJG( V( 2, M22 ) )
427429
DO 30 J = JTOP, MIN( KBOT, K+3 )
428-
REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )*
429-
$ H( J, K+2 ) )
430-
H( J, K+1 ) = H( J, K+1 ) - REFSUM
431-
H( J, K+2 ) = H( J, K+2 ) -
432-
$ REFSUM*CONJG( V( 2, M22 ) )
430+
REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 )
431+
H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1
432+
H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2
433433
30 CONTINUE
434434
*
435435
* ==== Perform update from left within
@@ -442,12 +442,13 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
442442
ELSE
443443
JBOT = KBOT
444444
END IF
445+
T1 = CONJG( V( 1, M22 ) )
446+
T2 = T1*V( 2, M22 )
445447
DO 40 J = K+1, JBOT
446-
REFSUM = CONJG( V( 1, M22 ) )*
447-
$ ( H( K+1, J )+CONJG( V( 2, M22 ) )*
448-
$ H( K+2, J ) )
449-
H( K+1, J ) = H( K+1, J ) - REFSUM
450-
H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 )
448+
REFSUM = H( K+1, J ) +
449+
$ CONJG( V( 2, M22 ) )*H( K+2, J )
450+
H( K+1, J ) = H( K+1, J ) - REFSUM*T1
451+
H( K+2, J ) = H( K+2, J ) - REFSUM*T2
451452
40 CONTINUE
452453
*
453454
* ==== The following convergence test requires that
@@ -610,25 +611,28 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
610611
* . deflation check. We still delay most of the
611612
* . updates from the left for efficiency. ====
612613
*
614+
T1 = V( 1, M )
615+
T2 = T1*CONJG( V( 2, M ) )
616+
T3 = T1*CONJG( V( 3, M ) )
613617
DO 70 J = JTOP, MIN( KBOT, K+3 )
614-
REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )*
615-
$ H( J, K+2 )+V( 3, M )*H( J, K+3 ) )
616-
H( J, K+1 ) = H( J, K+1 ) - REFSUM
617-
H( J, K+2 ) = H( J, K+2 ) -
618-
$ REFSUM*CONJG( V( 2, M ) )
619-
H( J, K+3 ) = H( J, K+3 ) -
620-
$ REFSUM*CONJG( V( 3, M ) )
618+
REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 )
619+
$ + V( 3, M )*H( J, K+3 )
620+
H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1
621+
H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2
622+
H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3
621623
70 CONTINUE
622624
*
623625
* ==== Perform update from left for subsequent
624626
* . column. ====
625627
*
626-
REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 )
627-
$ +CONJG( V( 2, M ) )*H( K+2, K+1 )
628-
$ +CONJG( V( 3, M ) )*H( K+3, K+1 ) )
629-
H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM
630-
H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M )
631-
H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M )
628+
T1 = CONJG( V( 1, M ) )
629+
T2 = T1*V( 2, M )
630+
T3 = T1*V( 3, M )
631+
REFSUM = H( K+1, K+1 ) + CONJG( V( 2, M ) )*H( K+2, K+1 )
632+
$ + CONJG( V( 3, M ) )*H( K+3, K+1 )
633+
H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1
634+
H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2
635+
H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3
632636
*
633637
* ==== The following convergence test requires that
634638
* . the tradition small-compared-to-nearby-diagonals
@@ -688,13 +692,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
688692
*
689693
DO 100 M = MBOT, MTOP, -1
690694
K = KRCOL + 2*( M-1 )
695+
T1 = CONJG( V( 1, M ) )
696+
T2 = T1*V( 2, M )
697+
T3 = T1*V( 3, M )
691698
DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT
692-
REFSUM = CONJG( V( 1, M ) )*
693-
$ ( H( K+1, J )+CONJG( V( 2, M ) )*
694-
$ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) )
695-
H( K+1, J ) = H( K+1, J ) - REFSUM
696-
H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M )
697-
H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M )
699+
REFSUM = H( K+1, J ) + CONJG( V( 2, M ) )*
700+
$ H( K+2, J ) + CONJG( V( 3, M ) )*H( K+3, J )
701+
H( K+1, J ) = H( K+1, J ) - REFSUM*T1
702+
H( K+2, J ) = H( K+2, J ) - REFSUM*T2
703+
H( K+3, J ) = H( K+3, J ) - REFSUM*T3
698704
90 CONTINUE
699705
100 CONTINUE
700706
*
@@ -712,14 +718,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
712718
I2 = MAX( 1, KTOP-INCOL )
713719
I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 )
714720
I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 )
721+
T1 = V( 1, M )
722+
T2 = T1*CONJG( V( 2, M ) )
723+
T3 = T1*CONJG( V( 3, M ) )
715724
DO 110 J = I2, I4
716-
REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )*
717-
$ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) )
718-
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM
719-
U( J, KMS+2 ) = U( J, KMS+2 ) -
720-
$ REFSUM*CONJG( V( 2, M ) )
721-
U( J, KMS+3 ) = U( J, KMS+3 ) -
722-
$ REFSUM*CONJG( V( 3, M ) )
725+
REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 )
726+
$ + V( 3, M )*U( J, KMS+3 )
727+
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1
728+
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2
729+
U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3
723730
110 CONTINUE
724731
120 CONTINUE
725732
ELSE IF( WANTZ ) THEN
@@ -730,14 +737,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S,
730737
*
731738
DO 140 M = MBOT, MTOP, -1
732739
K = KRCOL + 2*( M-1 )
740+
T1 = V( 1, M )
741+
T2 = T1*CONJG( V( 2, M ) )
742+
T3 = T1*CONJG( V( 3, M ) )
733743
DO 130 J = ILOZ, IHIZ
734-
REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )*
735-
$ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) )
736-
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
737-
Z( J, K+2 ) = Z( J, K+2 ) -
738-
$ REFSUM*CONJG( V( 2, M ) )
739-
Z( J, K+3 ) = Z( J, K+3 ) -
740-
$ REFSUM*CONJG( V( 3, M ) )
744+
REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 )
745+
$ + V( 3, M )*Z( J, K+3 )
746+
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1
747+
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2
748+
Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3
741749
130 CONTINUE
742750
140 CONTINUE
743751
END IF

SRC/dlaqr5.f

Lines changed: 59 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,8 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
286286
* ..
287287
* .. Local Scalars ..
288288
DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM,
289-
$ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2,
290-
$ ULP
289+
$ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2,
290+
$ T3, TST1, TST2, ULP
291291
INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN,
292292
$ JROW, JTOP, K, K1, KDU, KMS, KRCOL,
293293
$ M, M22, MBOT, MTOP, NBMPS, NDCOL,
@@ -447,11 +447,12 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
447447
* ==== Perform update from right within
448448
* . computational window. ====
449449
*
450+
T1 = V( 1, M22 )
451+
T2 = T1*V( 2, M22 )
450452
DO 30 J = JTOP, MIN( KBOT, K+3 )
451-
REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )*
452-
$ H( J, K+2 ) )
453-
H( J, K+1 ) = H( J, K+1 ) - REFSUM
454-
H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 )
453+
REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 )
454+
H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1
455+
H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2
455456
30 CONTINUE
456457
*
457458
* ==== Perform update from left within
@@ -464,11 +465,12 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
464465
ELSE
465466
JBOT = KBOT
466467
END IF
468+
T1 = V( 1, M22 )
469+
T2 = T1*V( 2, M22 )
467470
DO 40 J = K+1, JBOT
468-
REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )*
469-
$ H( K+2, J ) )
470-
H( K+1, J ) = H( K+1, J ) - REFSUM
471-
H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 )
471+
REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J )
472+
H( K+1, J ) = H( K+1, J ) - REFSUM*T1
473+
H( K+2, J ) = H( K+2, J ) - REFSUM*T2
472474
40 CONTINUE
473475
*
474476
* ==== The following convergence test requires that
@@ -522,18 +524,20 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
522524
*
523525
IF( ACCUM ) THEN
524526
KMS = K - INCOL
527+
T1 = V( 1, M22 )
528+
T2 = T1*V( 2, M22 )
525529
DO 50 J = MAX( 1, KTOP-INCOL ), KDU
526-
REFSUM = V( 1, M22 )*( U( J, KMS+1 )+
527-
$ V( 2, M22 )*U( J, KMS+2 ) )
528-
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM
529-
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 )
530+
REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 )
531+
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1
532+
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2
530533
50 CONTINUE
531534
ELSE IF( WANTZ ) THEN
535+
T1 = V( 1, M22 )
536+
T2 = T1*V( 2, M22 )
532537
DO 60 J = ILOZ, IHIZ
533-
REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )*
534-
$ Z( J, K+2 ) )
535-
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
536-
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 )
538+
REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 )
539+
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1
540+
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2
537541
60 CONTINUE
538542
END IF
539543
END IF
@@ -631,22 +635,25 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
631635
* . deflation check. We still delay most of the
632636
* . updates from the left for efficiency. ====
633637
*
638+
T1 = V( 1, M )
639+
T2 = T1*V( 2, M )
640+
T3 = T1*V( 3, M )
634641
DO 70 J = JTOP, MIN( KBOT, K+3 )
635-
REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )*
636-
$ H( J, K+2 )+V( 3, M )*H( J, K+3 ) )
637-
H( J, K+1 ) = H( J, K+1 ) - REFSUM
638-
H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M )
639-
H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M )
642+
REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 )
643+
$ + V( 3, M )*H( J, K+3 )
644+
H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1
645+
H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2
646+
H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3
640647
70 CONTINUE
641648
*
642649
* ==== Perform update from left for subsequent
643650
* . column. ====
644651
*
645-
REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )*
646-
$ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) )
647-
H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM
648-
H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M )
649-
H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M )
652+
REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 )
653+
$ + V( 3, M )*H( K+3, K+1 )
654+
H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1
655+
H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2
656+
H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3
650657
*
651658
* ==== The following convergence test requires that
652659
* . the tradition small-compared-to-nearby-diagonals
@@ -706,12 +713,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
706713
*
707714
DO 100 M = MBOT, MTOP, -1
708715
K = KRCOL + 2*( M-1 )
716+
T1 = V( 1, M )
717+
T2 = T1*V( 2, M )
718+
T3 = T1*V( 3, M )
709719
DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT
710-
REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )*
711-
$ H( K+2, J )+V( 3, M )*H( K+3, J ) )
712-
H( K+1, J ) = H( K+1, J ) - REFSUM
713-
H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M )
714-
H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M )
720+
REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J )
721+
$ + V( 3, M )*H( K+3, J )
722+
H( K+1, J ) = H( K+1, J ) - REFSUM*T1
723+
H( K+2, J ) = H( K+2, J ) - REFSUM*T2
724+
H( K+3, J ) = H( K+3, J ) - REFSUM*T3
715725
90 CONTINUE
716726
100 CONTINUE
717727
*
@@ -729,12 +739,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
729739
I2 = MAX( 1, KTOP-INCOL )
730740
I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 )
731741
I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 )
742+
T1 = V( 1, M )
743+
T2 = T1*V( 2, M )
744+
T3 = T1*V( 3, M )
732745
DO 110 J = I2, I4
733-
REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )*
734-
$ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) )
735-
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM
736-
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M )
737-
U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M )
746+
REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 )
747+
$ + V( 3, M )*U( J, KMS+3 )
748+
U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1
749+
U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2
750+
U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3
738751
110 CONTINUE
739752
120 CONTINUE
740753
ELSE IF( WANTZ ) THEN
@@ -745,12 +758,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
745758
*
746759
DO 140 M = MBOT, MTOP, -1
747760
K = KRCOL + 2*( M-1 )
761+
T1 = V( 1, M )
762+
T2 = T1*V( 2, M )
763+
T3 = T1*V( 3, M )
748764
DO 130 J = ILOZ, IHIZ
749-
REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )*
750-
$ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) )
751-
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
752-
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M )
753-
Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M )
765+
REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 )
766+
$ + V( 3, M )*Z( J, K+3 )
767+
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1
768+
Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2
769+
Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3
754770
130 CONTINUE
755771
140 CONTINUE
756772
END IF

0 commit comments

Comments
 (0)