Skip to content

Applications integrated with FTI

Karol Sierociński edited this page Oct 23, 2017 · 13 revisions

List of applications integrated with FTI

CoMD
CoSP2
LULESH

CoMD

⬆️ Top
Classical molecular dynamics proxy application.

https://github.com/exmatex/CoMD

File changes

Integrating FTI into CoMD required adding only ~30 lines of code across 2 files. All occurrences of MPI_COMM_WORLD were changed to FTI_COMM_WORLD, except in the initialization call FTI_Init("config.fti", MPI_COMM_WORLD);

File: src-mpi/CoMD.c
102:    int i = 1;
103:    FTI_Protect(i++, sim->boxes->nAtoms, sim->boxes->nTotalBoxes, FTI_INTG);
104: 
105:    FTIT_type RealTInfo;
106:    FTI_InitType(&RealTInfo, sizeof(real_t));
107:    FTIT_type Real3Info;
108:    FTI_InitType(&Real3Info, sizeof(real3));
109:    int maxTotalAtoms = MAXATOMS * (sim->boxes->nTotalBoxes);
110: 
111:    FTI_Protect(i++, sim->atoms->gid, maxTotalAtoms, FTI_INTG);
112:    FTI_Protect(i++, sim->atoms->iSpecies, maxTotalAtoms, FTI_INTG);
113:    FTI_Protect(i++, sim->atoms->r, maxTotalAtoms, Real3Info);
114:    FTI_Protect(i++, sim->atoms->p, maxTotalAtoms, Real3Info);
115:    FTI_Protect(i++, sim->atoms->f, maxTotalAtoms, Real3Info);
116: 
117:    int iStep = 0;
118:    FTI_Protect(i++, &iStep, 1, FTI_INTG);
119: 
120:    if (FTI_Status() != 0) {
121:        int res = FTI_Recover();
122:        if (res != 0) {
123:            printf("\tRecovery failed! FTI_Recover returned %d.\n", res);
124:        }
125:    }
---
139:    profileStart(loopTimer);
140:    for (; iStep<nSteps;)
141:    {
142:       startTimer(commReduceTimer);
143:       sumAtoms(sim);
144:       stopTimer(commReduceTimer);
145: 
146:       printThings(sim, iStep, getElapsedTime(timestepTimer));
147: 
148:       startTimer(timestepTimer);
149:       timestep(sim, printRate, sim->dt);
150:       stopTimer(timestepTimer);
151: 
152:       iStep += printRate;
153:       int res = FTI_Checkpoint(iStep, 1);
154:       if (res != FTI_DONE) {
155:             printf("\tCheckpoint failed! FTI_Checkpoint returned %d.\n", res);
156:       }
157:    }
158:    profileStop(loopTimer);
File: src-mpi/parallel.c
64: void initParallel(int* argc, char*** argv)
65: {
66: #ifdef DO_MPI
67:    MPI_Init(argc, argv);
68:    FTI_Init("config.fti", MPI_COMM_WORLD);
69:    MPI_Comm_rank(FTI_COMM_WORLD, &myRank);
70:    MPI_Comm_size(FTI_COMM_WORLD, &nRanks);
71: #endif
72: }
73: 
74: void destroyParallel()
75: {
76: #ifdef DO_MPI
77:    FTI_Finalize();
78:    MPI_Finalize();
79: #endif
80: }

Results

Log of a run without FTI integrated.

===============================================================================
                   Poznan Supercomputing and Networking Center
                               eagle.man.poznan.pl
===============================================================================
-------------------------------------------------------------------------------
Start of calculations [pon, 9 paź 2017, 12:57:49 CEST]
-------------------------------------------------------------------------------
Support:        [email protected]
-------------------------------------------------------------------------------
Mon Oct  9 12:57:53 2017: Starting Initialization
Mini-Application Name    : CoMD-mpi
Mini-Application Version : 1.1
Platform:
  hostname: e0026
  kernel name: 'Linux'
  kernel release: '3.10.105-1.el6.elrepo.x86_64'
  processor: 'x86_64'
Build:
  CC: '/opt/exp_soft/local/generic/openmpi/1.10.2-1_gcc482/bin/mpicc'
  compiler version: 'gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-14)'
  CFLAGS: '-std=c99 -DDOUBLE -DDO_MPI -g -O5  -I/home/users/ksiero1/fti/include/ '
  LDFLAGS: '-L/home/users/ksiero1/fti/lib/ -lm -lcrypto'
  using MPI: true
  Threading: none
  Double Precision: true
Run Date/Time: 2017-10-09, 12:57:53
Command Line Parameters:
  doeam: 0
  potDir: pots
  potName: Cu_u6.eam
  potType: funcfl
  nx: 800
  ny: 800
  nz: 800
  xproc: 8
  yproc: 8
  zproc: 8
  Lattice constant: -1 Angstroms
  nSteps: 100
  printRate: 10
  Time step: 1 fs
  Initial Temperature: 600 K
  Initial Delta: 0 Angstroms
Simulation data: 
  Total atoms        : 2048000000
  Min global bounds  : [   0.0000000000,   0.0000000000,   0.0000000000 ]
  Max global bounds  : [ 2892.0000000000, 2892.0000000000, 2892.0000000000 ]
Decomposition data: 
  Processors         :      8,     8,     8
  Local boxes        :     62,    62,    62 =   238328
  Box size           : [   5.8306451613,   5.8306451613,   5.8306451613 ]
  Box factor         : [   1.0074548875,   1.0074548875,   1.0074548875 ] 
  Max Link Cell Occupancy: 32 of 64
Potential data: 
  Potential type   : Lennard-Jones
  Species name     : Cu
  Atomic number    : 29
  Mass             : 63.55 amu
  Lattice Type     : FCC
  Lattice spacing  : 3.615 Angstroms
  Cutoff           : 5.7875 Angstroms
  Epsilon          : 0.167 eV
  Sigma            : 2.315 Angstroms
Memory data: 
  Intrinsic atom footprint =   88 B/atom 
  Total atom footprint     = -157.000 MB (335.69 MB/node)
  Link cell atom footprint = 1280.082 MB/node
  Link cell atom footprint = 1408.000 MB/node (including halo cell data
Initial energy : -1.166063303598, atom count : 2048000000 
Mon Oct  9 12:58:06 2017: Initialization Finished
Mon Oct  9 12:58:06 2017: Starting simulation
#                                                                                         Performance
#  Loop   Time(fs)       Total Energy   Potential Energy     Kinetic Energy  Temperature   (us/atom)     # Atoms
      0       0.00    -1.166063303598    -1.243619295198     0.077555991600     600.0000     0.0000   2048000000
     10      10.00    -1.166059649733    -1.233151964368     0.067092314635     519.0494     2.3384   2048000000
     20      20.00    -1.166048425247    -1.208164731096     0.042116305849     325.8263     2.4122   2048000000
     30      30.00    -1.166037572103    -1.186566075400     0.020528503297     158.8156     2.4182   2048000000
     40      40.00    -1.166042088520    -1.183621872290     0.017579783770     136.0033     2.4197   2048000000
     50      50.00    -1.166051685771    -1.193725983586     0.027674297815     214.0979     2.4213   2048000000
     60      60.00    -1.166054644001    -1.202677534791     0.036622890790     283.3274     2.4201   2048000000
     70      70.00    -1.166052134038    -1.204922829363     0.038870695326     300.7172     2.4207   2048000000
     80      80.00    -1.166048793793    -1.203643980438     0.037595186645     290.8494     2.4198   2048000000
     90      90.00    -1.166048002607    -1.203830919192     0.037782916585     292.3017     2.4193   2048000000
    100     100.00    -1.166049790544    -1.206871500823     0.040821710279     315.8109     2.4176   2048000000
Mon Oct  9 13:14:11 2017: Ending simulation
Simulation Validation:
  Initial energy  : -1.166063303598
  Final energy    : -1.166049790544
  eFinal/eInitial : 0.999988
  Final atom count : 2048000000, no atoms lost
Timings for Rank 0
        Timer        # Calls    Avg/Call (s)   Total (s)    % Loop
___________________________________________________________________
total                      1     977.2662      977.2662      101.34
loop                       1     964.3040      964.3040      100.00
timestep                  10      96.4285      964.2854      100.00
  position               100       0.1001       10.0087        1.04
  velocity               200       0.1025       20.5055        2.13
  redistribute           101       1.2731      128.5869       13.33
    atomHalo             101       0.9613       97.0902       10.07
  force                  101       7.8922      797.1134       82.66
commHalo                 303       0.3041       92.1548        9.56
commReduce                39       0.4388       17.1143        1.77
Timing Statistics Across 512 Ranks:
        Timer        Rank: Min(s)       Rank: Max(s)      Avg(s)    Stdev(s)
_____________________________________________________________________________
total               51:  977.2630     140:  977.2672    977.2650      0.0012
loop                67:  964.3023     104:  964.3043    964.3033      0.0008
timestep            51:  964.2738     463:  964.2999    964.2841      0.0061
  position          49:    4.5466     373:   16.3177     11.2964      3.3782
  velocity           3:    7.8438     329:   29.0049     20.1119      6.3239
  redistribute      51:   53.6754     481:  168.3860    127.0497     17.6334
    atomHalo        51:   24.1044     481:  142.1157     94.3223     21.2231
  force            323:  775.9471      51:  905.1509    795.6917      9.6434
commHalo            51:   19.3264     481:  137.4904     89.3663     21.3808
commReduce          51:    2.8272     339:   27.7147     19.1658      4.1010
---------------------------------------------------
 Average atom update rate:       2.41 us/atom/task
---------------------------------------------------
---------------------------------------------------
 Average all atom update rate:   0.00 us/atom
---------------------------------------------------
---------------------------------------------------
 Average atom rate:            212.39 atoms/us
---------------------------------------------------
Mon Oct  9 13:14:11 2017: CoMD Ending
-------------------------------------------------------------------------------
End of calculations [pon, 9 paź 2017, 13:14:11 CEST].
-------------------------------------------------------------------------------

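Log of a run with FTI integrated. The run terminates abnormally after checkpoint ID 70 and is then restarted, recovering from the level 1 checkpoint and continuing from step 70.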

===============================================================================
                   Poznan Supercomputing and Networking Center
                               eagle.man.poznan.pl
===============================================================================
-------------------------------------------------------------------------------
Start of calculations [pon, 9 paź 2017, 12:14:02 CEST]
-------------------------------------------------------------------------------
Support:        [email protected]
-------------------------------------------------------------------------------
[ FTI  Information ] : Reading FTI configuration file (/home/users/ksiero1/CoMD/bin/config.fti)... 
[ FTI  Information ] : The execution ID is: 2017-10-09_12-14-07 
[ FTI  Information ] : Selected Ckpt I/O is POSIX 
[ FTI  Information ] : FTI has been initialized. 
Mon Oct  9 12:14:08 2017: Starting Initialization
Mini-Application Name    : CoMD-mpi
Mini-Application Version : 1.1
Platform:
  hostname: e0026
  kernel name: 'Linux'
  kernel release: '3.10.105-1.el6.elrepo.x86_64'
  processor: 'x86_64'
Build:
  CC: '/opt/exp_soft/local/generic/openmpi/1.10.2-1_gcc482/bin/mpicc'
  compiler version: 'gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-14)'
  CFLAGS: '-std=c99 -DDOUBLE -DDO_MPI -g -O5  -I/home/users/ksiero1/fti/include/ '
  LDFLAGS: '-L/home/users/ksiero1/fti/lib/ -lm -lcrypto'
  using MPI: true
  Threading: none
  Double Precision: true
Run Date/Time: 2017-10-09, 12:14:08
Command Line Parameters:
  doeam: 0
  potDir: pots
  potName: Cu_u6.eam
  potType: funcfl
  nx: 800
  ny: 800
  nz: 800
  xproc: 8
  yproc: 8
  zproc: 8
  Lattice constant: -1 Angstroms
  nSteps: 100
  printRate: 10
  Time step: 1 fs
  Initial Temperature: 600 K
  Initial Delta: 0 Angstroms
[ FTI  Information ] : Variable ID 1 to protect. Current ckpt. size per rank is 1.00MB. 
[ FTI  Information ] : Variable ID 2 to protect. Current ckpt. size per rank is 65.00MB. 
[ FTI  Information ] : Variable ID 3 to protect. Current ckpt. size per rank is 129.00MB. 
[ FTI  Information ] : Variable ID 4 to protect. Current ckpt. size per rank is 513.00MB. 
[ FTI  Information ] : Variable ID 5 to protect. Current ckpt. size per rank is 897.00MB. 
[ FTI  Information ] : Variable ID 6 to protect. Current ckpt. size per rank is 1281.00MB. 
[ FTI  Information ] : Variable ID 7 to protect. Current ckpt. size per rank is 1281.00MB. 
Simulation data: 
  Total atoms        : 2048000000
  Min global bounds  : [   0.0000000000,   0.0000000000,   0.0000000000 ]
  Max global bounds  : [ 2892.0000000000, 2892.0000000000, 2892.0000000000 ]
Decomposition data: 
  Processors         :      8,     8,     8
  Local boxes        :     62,    62,    62 =   238328
  Box size           : [   5.8306451613,   5.8306451613,   5.8306451613 ]
  Box factor         : [   1.0074548875,   1.0074548875,   1.0074548875 ] 
  Max Link Cell Occupancy: 32 of 64
Potential data: 
  Potential type   : Lennard-Jones
  Species name     : Cu
  Atomic number    : 29
  Mass             : 63.55 amu
  Lattice Type     : FCC
  Lattice spacing  : 3.615 Angstroms
  Cutoff           : 5.7875 Angstroms
  Epsilon          : 0.167 eV
  Sigma            : 2.315 Angstroms
Memory data: 
  Intrinsic atom footprint =   88 B/atom 
  Total atom footprint     = -157.000 MB (335.69 MB/node)
  Link cell atom footprint = 1280.082 MB/node
  Link cell atom footprint = 1408.000 MB/node (including halo cell data
Initial energy : -1.166063303598, atom count : 2048000000 
Mon Oct  9 12:14:21 2017: Initialization Finished
Mon Oct  9 12:14:21 2017: Starting simulation
#                                                                                         Performance
#  Loop   Time(fs)       Total Energy   Potential Energy     Kinetic Energy  Temperature   (us/atom)     # Atoms
      0       0.00    -1.166063303598    -1.243619295198     0.077555991600     600.0000     0.0000   2048000000
[ FTI  Information ] : Post-checkpoint took 1.77 sec. (Pt:1.75s, Cl:0.02s) 
[ FTI  Information ] : Ckpt. ID 10 (L1) (1281.00 MB/proc) taken in 35.45 sec. (Wt:0.00s, Wr:33.68s, Ps:1.77s) 
     10      10.00    -1.166059649733    -1.233151964368     0.067092314635     519.0494     2.4020   2048000000
[ FTI  Information ] : Post-checkpoint took 0.53 sec. (Pt:0.39s, Cl:0.14s) 
[ FTI  Information ] : Ckpt. ID 20 (L1) (1281.00 MB/proc) taken in 38.83 sec. (Wt:0.00s, Wr:38.30s, Ps:0.53s) 
     20      20.00    -1.166048425247    -1.208164731096     0.042116305849     325.8263     2.4509   2048000000
[ FTI  Information ] : Post-checkpoint took 3.89 sec. (Pt:2.76s, Cl:1.13s) 
[ FTI  Information ] : Ckpt. ID 30 (L1) (1281.00 MB/proc) taken in 38.64 sec. (Wt:0.00s, Wr:34.75s, Ps:3.89s) 
     30      30.00    -1.166037572103    -1.186566075400     0.020528503297     158.8156     2.5215   2048000000
[ FTI  Information ] : Post-checkpoint took 3.57 sec. (Pt:2.95s, Cl:0.62s) 
[ FTI  Information ] : Ckpt. ID 40 (L1) (1281.00 MB/proc) taken in 40.21 sec. (Wt:0.00s, Wr:36.63s, Ps:3.57s) 
     40      40.00    -1.166042088520    -1.183621872290     0.017579783770     136.0033     2.2746   2048000000
[ FTI  Information ] : Post-checkpoint took 3.71 sec. (Pt:3.30s, Cl:0.41s) 
[ FTI  Information ] : Ckpt. ID 50 (L1) (1281.00 MB/proc) taken in 38.81 sec. (Wt:0.00s, Wr:35.10s, Ps:3.71s) 
     50      50.00    -1.166051685771    -1.193725983586     0.027674297815     214.0979     2.2883   2048000000
[ FTI  Information ] : Post-checkpoint took 2.26 sec. (Pt:1.76s, Cl:0.50s) 
[ FTI  Information ] : Ckpt. ID 60 (L1) (1281.00 MB/proc) taken in 38.06 sec. (Wt:0.00s, Wr:35.80s, Ps:2.26s) 
     60      60.00    -1.166054644001    -1.202677534791     0.036622890790     283.3274     2.3185   2048000000
[ FTI  Information ] : Post-checkpoint took 1.11 sec. (Pt:0.23s, Cl:0.87s) 
[ FTI  Information ] : Ckpt. ID 70 (L1) (1281.00 MB/proc) taken in 43.70 sec. (Wt:0.00s, Wr:42.59s, Ps:1.11s) 
--------------------------------------------------------------------------
mpirun has exited due to process rank 416 with PID 0 on
node e0769 exiting improperly. There are three reasons this could occur:
1. this process did not call "init" before exiting, but others in
the job did. This can cause a job to hang indefinitely while it waits
for all processes to call "init". By rule, if one process calls "init",
then ALL processes must call "init" prior to termination.
2. this process called "init", but exited without calling "finalize".
By rule, all processes that call "init" MUST call "finalize" prior to
exiting or it will be considered an "abnormal termination"
3. this process called "MPI_Abort" or "orte_abort" and the mca parameter
orte_create_session_dirs is set to false. In this case, the run-time cannot
detect that the abort call was an abnormal termination. Hence, the only
error message you will receive is this one.
This may have caused other processes in the application to be
terminated by signals sent by mpirun (as reported here).
You can avoid this message by specifying -quiet on the mpirun command line.
--------------------------------------------------------------------------
[ FTI  Information ] : Reading FTI configuration file (/home/users/ksiero1/CoMD/bin/config.fti)... 
[ FTI  Information ] : This is a restart. The execution ID is: 2017-10-09_12-14-07 
[ FTI  Information ] : Selected Ckpt I/O is POSIX 
[ FTI  Information ] : Recovering successfully from level 1. 
[ FTI  Information ] : FTI has been initialized. 
Mon Oct  9 12:30:09 2017: Starting Initialization
Mini-Application Name    : CoMD-mpi
Mini-Application Version : 1.1
Platform:
  hostname: e0026
  kernel name: 'Linux'
  kernel release: '3.10.105-1.el6.elrepo.x86_64'
  processor: 'x86_64'
Build:
  CC: '/opt/exp_soft/local/generic/openmpi/1.10.2-1_gcc482/bin/mpicc'
  compiler version: 'gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-14)'
  CFLAGS: '-std=c99 -DDOUBLE -DDO_MPI -g -O5  -I/home/users/ksiero1/fti/include/ '
  LDFLAGS: '-L/home/users/ksiero1/fti/lib/ -lm -lcrypto'
  using MPI: true
  Threading: none
  Double Precision: true
Run Date/Time: 2017-10-09, 12:30:09
Command Line Parameters:
  doeam: 0
  potDir: pots
  potName: Cu_u6.eam
  potType: funcfl
  nx: 800
  ny: 800
  nz: 800
  xproc: 8
  yproc: 8
  zproc: 8
  Lattice constant: -1 Angstroms
  nSteps: 100
  printRate: 10
  Time step: 1 fs
  Initial Temperature: 600 K
  Initial Delta: 0 Angstroms
[ FTI  Information ] : Variable ID 1 to protect. Current ckpt. size per rank is 1.00MB. 
[ FTI  Information ] : Variable ID 2 to protect. Current ckpt. size per rank is 65.00MB. 
[ FTI  Information ] : Variable ID 3 to protect. Current ckpt. size per rank is 129.00MB. 
[ FTI  Information ] : Variable ID 4 to protect. Current ckpt. size per rank is 513.00MB. 
[ FTI  Information ] : Variable ID 5 to protect. Current ckpt. size per rank is 897.00MB. 
[ FTI  Information ] : Variable ID 6 to protect. Current ckpt. size per rank is 1281.00MB. 
[ FTI  Information ] : Variable ID 7 to protect. Current ckpt. size per rank is 1281.00MB. 
Simulation data: 
  Total atoms        : 2048000000
  Min global bounds  : [   0.0000000000,   0.0000000000,   0.0000000000 ]
  Max global bounds  : [ 2892.0000000000, 2892.0000000000, 2892.0000000000 ]
Decomposition data: 
  Processors         :      8,     8,     8
  Local boxes        :     62,    62,    62 =   238328
  Box size           : [   5.8306451613,   5.8306451613,   5.8306451613 ]
  Box factor         : [   1.0074548875,   1.0074548875,   1.0074548875 ] 
  Max Link Cell Occupancy: 32 of 64
Potential data: 
  Potential type   : Lennard-Jones
  Species name     : Cu
  Atomic number    : 29
  Mass             : 63.55 amu
  Lattice Type     : FCC
  Lattice spacing  : 3.615 Angstroms
  Cutoff           : 5.7875 Angstroms
  Epsilon          : 0.167 eV
  Sigma            : 2.315 Angstroms
Memory data: 
  Intrinsic atom footprint =   88 B/atom 
  Total atom footprint     = -157.000 MB (335.69 MB/node)
  Link cell atom footprint = 1280.082 MB/node
  Link cell atom footprint = 1408.000 MB/node (including halo cell data
Initial energy : -1.166063303598, atom count : 2048000000 
Mon Oct  9 12:30:22 2017: Initialization Finished
Mon Oct  9 12:30:22 2017: Starting simulation
#                                                                                         Performance
#  Loop   Time(fs)       Total Energy   Potential Energy     Kinetic Energy  Temperature   (us/atom)     # Atoms
     70      70.00    -1.166063303598    -1.243619295198     0.077555991600     600.0000     0.0000   2048000000
[ FTI  Information ] : Post-checkpoint took 0.47 sec. (Pt:0.28s, Cl:0.19s) 
[ FTI  Information ] : Ckpt. ID 80 (L1) (1281.00 MB/proc) taken in 42.96 sec. (Wt:0.00s, Wr:42.49s, Ps:0.47s) 
     80      80.00    -1.166048793793    -1.203643980438     0.037595186645     290.8494     2.2586   2048000000
[ FTI  Information ] : Post-checkpoint took 0.78 sec. (Pt:0.22s, Cl:0.56s) 
[ FTI  Information ] : Ckpt. ID 90 (L1) (1281.00 MB/proc) taken in 34.50 sec. (Wt:0.00s, Wr:33.72s, Ps:0.78s) 
     90      90.00    -1.166048002607    -1.203830919192     0.037782916585     292.3017     2.3377   2048000000
[ FTI  Information ] : Post-checkpoint took 0.68 sec. (Pt:0.16s, Cl:0.51s) 
[ FTI  Information ] : Ckpt. ID 100 (L1) (1281.00 MB/proc) taken in 33.82 sec. (Wt:0.00s, Wr:33.15s, Ps:0.68s) 
    100     100.00    -1.166049790544    -1.206871500823     0.040821710279     315.8109     2.3146   2048000000
Mon Oct  9 12:36:51 2017: Ending simulation
Simulation Validation:
  Initial energy  : -1.166063303598
  Final energy    : -1.166049790544
  eFinal/eInitial : 0.999988
  Final atom count : 2048000000, no atoms lost
Timings for Rank 0
        Timer        # Calls    Avg/Call (s)   Total (s)    % Loop
___________________________________________________________________
total                      1     401.7819      401.7819      103.47
loop                       1     388.3197      388.3197      100.00
timestep                   3      92.1447      276.4340       71.19
  position                30       0.1194        3.5818        0.92
  velocity                60       0.1042        6.2535        1.61
  redistribute            31       0.8148       25.2584        6.50
    atomHalo              31       0.4576       14.1847        3.65
  force                   31       8.0059      248.1816       63.91
commHalo                  93       0.1349       12.5416        3.23
commReduce                18       0.1819        3.2744        0.84
Timing Statistics Across 512 Ranks:
        Timer        Rank: Min(s)       Rank: Max(s)      Avg(s)    Stdev(s)
_____________________________________________________________________________
total               37:  401.7036      42:  401.9202    401.7762      0.0357
loop                34:  388.3196     370:  388.4222    388.3436      0.0197
timestep            79:  276.2655     235:  276.5081    276.4445      0.0336
  position         147:    1.3705     221:    6.0085      3.6971      0.9796
  velocity         147:    2.3765     206:    9.8759      6.5194      1.7920
  redistribute     206:   18.4708     417:   37.1847     25.1751      4.2695
    atomHalo       415:    5.8228     417:   29.1367     13.9548      5.2873
  force            481:  241.1729      10:  261.1229    247.7969      2.5561
commHalo           415:    4.1255     417:   27.6639     12.3329      5.3435
commReduce         415:    1.4558     193:    6.4059      3.3989      0.9957
---------------------------------------------------
 Average atom update rate:       2.30 us/atom/task
---------------------------------------------------
---------------------------------------------------
 Average all atom update rate:   0.00 us/atom
---------------------------------------------------
---------------------------------------------------
 Average atom rate:            222.25 atoms/us
---------------------------------------------------
Mon Oct  9 12:36:52 2017: CoMD Ending
-------------------------------------------------------------------------------
End of calculations [pon, 9 paź 2017, 12:36:53 CEST].
-------------------------------------------------------------------------------

CoSP2

⬆️ Top
Linear algebra algorithms and workloads for a quantum molecular dynamics (QMD) electronic structure code.

https://github.com/exmatex/CoSP2

File changes

Integrating FTI into CoSP2 required adding only ~30 lines of code across 2 files. All occurrences of MPI_COMM_WORLD were changed to FTI_COMM_WORLD, except in the initialization call FTI_Init("config.fti", MPI_COMM_WORLD);

File: src-mpi/sp2Loop.c 
Function: sp2Loop()
56:     FTIT_type RealTInfo;
57:     FTI_InitType(&RealTInfo, sizeof(real_t));
58:     int i = 1;
59:     FTI_Protect(i++, &iter, 1, FTI_INTG);
60:     FTI_Protect(i++, xmatrix->iia, xmatrix->hsize, FTI_INTG);
61:     FTI_Protect(i++, xmatrix->jjcontig, xmatrix->hsize * xmatrix->msize , FTI_INTG);
62:     FTI_Protect(i++, xmatrix->valcontig, xmatrix->hsize * xmatrix->msize, RealTInfo);
63: 
64:     if (FTI_Status() != 0) {
65:       int res = FTI_Recover();
66:       if (res != 0) {
67:          printf("\tRecovery failed! FTI_Recover returned %d.\n", res);
68:       }
69:     }
70: 
...
153:     if (iter % 10 == 0) {
154:         int res = FTI_Checkpoint(iter, 1);
155:         if (res != FTI_DONE) {
156:               printf("\tCheckpoint failed! FTI_Checkpoint returned %d.\n", res);
157:         }
158:     }
File: src-mpi/parallel.c
70: void initParallel(int* argc, char*** argv)
71: {
72: #ifdef DO_MPI
73:    MPI_Init(argc, argv);
74:    FTI_Init("config.fti", MPI_COMM_WORLD);
75:    MPI_Comm_rank(FTI_COMM_WORLD, &myRank);
76:    MPI_Comm_size(FTI_COMM_WORLD, &nRanks);
77: 
78:    requestList = (MPI_Request*) malloc(nRanks*sizeof(MPI_Request));
79:    rUsed = (int*) malloc(nRanks*sizeof(int));
80:    for (int i = 0; i < nRanks; i++) { rUsed[i] = 0; }
81: #endif
82: }
83: 
84: void destroyParallel()
85: {
86: #ifdef DO_MPI
87:    free(requestList);
88:    FTI_Finalize();
89:    MPI_Finalize();
90: #endif
91: }

Results

Log of a run without FTI integrated.

===============================================================================
                   Poznan Supercomputing and Networking Center

                               eagle.man.poznan.pl
===============================================================================
-------------------------------------------------------------------------------
Support:        [email protected]
-------------------------------------------------------------------------------
CoSP2: SP2 Loop

Parameters:
msparse = 80  hDim = 98304  debug = 1
hmatName = 
eps = 1e-05  hEps = 1e-16
idemTol = 1e-14

hDim = 98304 M = 80
Adjusted M = 96
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 3 local row min = 18432  row max = 24576  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 1 local row min = 6144  row max = 12288  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 11 local row min = 67584  row max = 73728  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 13 local row min = 79872  row max = 86016  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 8 local row min = 49152  row max = 55296  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 7 local row min = 43008  row max = 49152  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 6 local row min = 36864  row max = 43008  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 4 local row min = 24576  row max = 30720  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 9 local row min = 55296  row max = 61440  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 10 local row min = 61440  row max = 67584  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 14 local row min = 86016  row max = 92160  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 12 local row min = 73728  row max = 79872  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 2 local row min = 12288  row max = 18432  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
total procs = 16  total rows = 98304  total cols = 96
global row min = 0  row max = 98304  row extent = 98304
rank = 0 local row min = 0  row max = 6144  row extent = 6144

Sparsity:
Initial sparsity = 672042, fraction = 6.258879e-04, Avg per row = 6.836365
Max per row = 7
I = 4, count = 2, fraction = 0.000020
I = 5, count = 621, fraction = 0.006317
I = 6, count = 14838, fraction = 0.150940
I = 7, count = 82843, fraction = 0.842723
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 15 local row min = 92160  row max = 98304  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 5 local row min = 30720  row max = 36864  row extent = 6144

Gershgorin:
New  eMax, eMin = 1.745500e+00, -7.356212e-01
bufferSize = 9437184
Initial sparsity normalized = 672042, fraction = 6.258879e-04,  avg = 6.83636, max = 7

SP2Loop:
iter = 0  trX = 4.935743e+04  trX2 = 2.720037e+04
iter = 1  trX = 2.720037e+04  trX2 = 9.994787e+03
iter = 2  trX = 4.440595e+04  trX2 = 2.485384e+04
iter = 3  trX = 6.395806e+04  trX2 = 4.735425e+04
iter = 4  trX = 4.735425e+04  trX2 = 3.149323e+04
iter = 5  trX = 6.321528e+04  trX2 = 5.026180e+04
iter = 6  trX = 5.026180e+04  trX2 = 3.881328e+04
iter = 7  trX = 3.881328e+04  trX2 = 2.922713e+04
iter = 8  trX = 4.839943e+04  trX2 = 4.062611e+04
iter = 9  trX = 5.617275e+04  trX2 = 4.981154e+04
iter = 10  trX = 4.981154e+04  trX2 = 4.464542e+04
iter = 11  trX = 4.464542e+04  trX2 = 4.032639e+04
iter = 12  trX = 4.896445e+04  trX2 = 4.554145e+04
iter = 13  trX = 5.238745e+04  trX2 = 4.956883e+04
iter = 14  trX = 4.956883e+04  trX2 = 4.731790e+04
iter = 15  trX = 4.731790e+04  trX2 = 4.544718e+04
iter = 16  trX = 4.918861e+04  trX2 = 4.771064e+04
iter = 17  trX = 4.771064e+04  trX2 = 4.649398e+04
iter = 18  trX = 4.892731e+04  trX2 = 4.795556e+04
iter = 19  trX = 4.989906e+04  trX2 = 4.910173e+04
iter = 20  trX = 4.910173e+04  trX2 = 4.855031e+04
iter = 21  trX = 4.965316e+04  trX2 = 5.060054e+04
iter = 22  trX = 4.870578e+04  trX2 = -9.750371e+05
iter = 23  trX = 1.072449e+06  trX2 = -5.136388e+12
iter = 24  trX = -5.136388e+12  trX2 = 7.295617e+24

Results:
X2 Sparsity CCN = 2906510, fraction = 2.706898e-03 avg = 29.5665, max = 89
D Sparsity AAN = 2906464, fraction = 2.706856e-03 avg = 29.5661, max = 89
Number of iterations = 25


Counters for Rank 0
       Counter          Calls    Avg/Call(MB)         Total(MB)   
_________________________________________________________________
reduce                    29           0.0000            0.0004
send                      39           2.2910           89.3504
recv                      39           2.2772           88.8095

Counter Statistics Across 16 Ranks:
        Counter      Rank: Min(MB)        Rank: Max(MB)       Avg(MB)      Stdev(MB)
_______________________________________________________________________________________
reduce               0:      0.0004       0:      0.0004        0.0004        0.0000
send                15:     87.4100       7:    138.5495      129.9097       15.7564
recv                15:     88.4093       6:    137.0340      129.9097       15.6236


Timings for Rank 0
        Timer        # Calls    Avg/Call (s)   Total (s)    % Loop
___________________________________________________________________
total                      1       3.4711        3.4711      100.00
loop                       1       3.4711        3.4711      100.00
  pre                      1       0.5444        0.5444       15.68
  sp2Loop                  1       2.7193        2.7193       78.34
    norm                   1       0.0417        0.0417        1.20
    x2                    25       0.0473        1.1820       34.05
    xadd                  13       0.0454        0.5899       16.99
    xset                  12       0.0383        0.4591       13.23
    exchange              50       0.0032        0.1576        4.54
    reduceComm            29       0.0070        0.2034        5.86

Timing Statistics Across 16 Ranks:
        Timer        Rank: Min(s)       Rank: Max(s)      Avg(s)    Stdev(s)
_____________________________________________________________________________
total                1:    3.4591      15:    3.5566      3.5160      0.0296
loop                 1:    3.4591      15:    3.5566      3.5160      0.0296
  pre                3:    0.4203       5:    0.5927      0.5180      0.0440
  sp2Loop           15:    2.7191      12:    2.7256      2.7229      0.0019
    norm             3:    0.0082       7:    0.0450      0.0376      0.0112
    x2               1:    0.2678      15:    1.1916      1.0701      0.3027
    xadd             1:    0.0548       0:    0.5899      0.5167      0.1744
    xset             1:    0.0408      15:    0.4638      0.4071      0.1383
    exchange         0:    0.1576       1:    1.1532      0.3589      0.2991
    reduceComm       5:    0.0513       3:    1.4170      0.3006      0.4217
-------------------------------------------------------------------------------

End of calculations [pon, 16 paź 2017, 12:17:13 CEST].

-------------------------------------------------------------------------------

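Log of a run with FTI integrated. The job is terminated after iteration 14 and then restarted; the restarted run recovers from the level 1 checkpoint taken at iteration 10 and resumes from there.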

-------------------------------------------------------------------------------
Support:        [email protected]
-------------------------------------------------------------------------------
[ FTI  Information ] : Reading FTI configuration file (/home/users/ksiero1/CoSP2/bin/config.fti)... 
[ FTI  Information ] : The execution ID is: 2017-10-16_12-02-59 
[ FTI  Information ] : FTI has been initialized. 
CoSP2: SP2 Loop

Parameters:
msparse = 80  hDim = 98304  debug = 1
hmatName = 
eps = 1e-05  hEps = 1e-16
idemTol = 1e-14

hDim = 98304 M = 80
Adjusted M = 96
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 1 local row min = 6144  row max = 12288  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 3 local row min = 18432  row max = 24576  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 10 local row min = 61440  row max = 67584  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
total procs = 16  total rows = 98304  total cols = 96
global row min = 0  row max = 98304  row extent = 98304
rank = 0 local row min = 0  row max = 6144  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 5 local row min = 30720  row max = 36864  row extent = 6144

Sparsity:
Initial sparsity = 672042, fraction = 6.258879e-04, Avg per row = 6.836365
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 6 local row min = 36864  row max = 43008  row extent = 6144
Max per row = 7
I = 4, count = 2, fraction = 0.000020
I = 5, count = 621, fraction = 0.006317
I = 6, count = 14838, fraction = 0.150940
I = 7, count = 82843, fraction = 0.842723
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 13 local row min = 79872  row max = 86016  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 7 local row min = 43008  row max = 49152  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 12 local row min = 73728  row max = 79872  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 15 local row min = 92160  row max = 98304  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 8 local row min = 49152  row max = 55296  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 11 local row min = 67584  row max = 73728  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 14 local row min = 86016  row max = 92160  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 4 local row min = 24576  row max = 30720  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 9 local row min = 55296  row max = 61440  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 2 local row min = 12288  row max = 18432  row extent = 6144

Gershgorin:
New  eMax, eMin = 1.745500e+00, -7.356212e-01
bufferSize = 9437184
Initial sparsity normalized = 672042, fraction = 6.258879e-04,  avg = 6.83636, max = 7
[ FTI  Information ] : Variable ID 1 to protect. Current ckpt. size per rank is 0.00MB. 
[ FTI  Information ] : Variable ID 2 to protect. Current ckpt. size per rank is 0.38MB. 
[ FTI  Information ] : Variable ID 3 to protect. Current ckpt. size per rank is 36.38MB. 
[ FTI  Information ] : Variable ID 4 to protect. Current ckpt. size per rank is 108.38MB. 
[ FTI  Information ] : Variable ID 5 to protect. Current ckpt. size per rank is 108.75MB. 
[ FTI  Information ] : Variable ID 6 to protect. Current ckpt. size per rank is 144.75MB. 
[ FTI  Information ] : Variable ID 7 to protect. Current ckpt. size per rank is 216.75MB. 

SP2Loop:
iter = 0  trX = 4.935743e+04  trX2 = 2.720037e+04
iter = 1  trX = 2.720037e+04  trX2 = 9.994787e+03
iter = 2  trX = 4.440595e+04  trX2 = 2.485384e+04
iter = 3  trX = 6.395806e+04  trX2 = 4.735425e+04
iter = 4  trX = 4.735425e+04  trX2 = 3.149323e+04
iter = 5  trX = 6.321528e+04  trX2 = 5.026180e+04
iter = 6  trX = 5.026180e+04  trX2 = 3.881328e+04
iter = 7  trX = 3.881328e+04  trX2 = 2.922713e+04
iter = 8  trX = 4.839943e+04  trX2 = 4.062611e+04
iter = 9  trX = 5.617275e+04  trX2 = 4.981154e+04
[ FTI  Information ] : Post-checkpoint took 0.00 sec. (Pt:0.00s, Cl:0.00s) 
[ FTI  Information ] : Ckpt. ID 10 (L1) (216.75 MB/proc) taken in 9.00 sec. (Wt:0.00s, Wr:9.00s, Ps:0.00s) 
iter = 10  trX = 4.981154e+04  trX2 = 4.464542e+04
iter = 11  trX = 4.464542e+04  trX2 = 4.032639e+04
iter = 12  trX = 4.896445e+04  trX2 = 4.554145e+04
iter = 13  trX = 5.238745e+04  trX2 = 4.956883e+04
iter = 14  trX = 4.956883e+04  trX2 = 4.731790e+04
--------------------------------------------------------------------------
mpirun has exited due to process rank 3 with PID 12638 on
node e0700 exiting improperly. There are three reasons this could occur:

1. this process did not call "init" before exiting, but others in
the job did. This can cause a job to hang indefinitely while it waits
for all processes to call "init". By rule, if one process calls "init",
then ALL processes must call "init" prior to termination.

2. this process called "init", but exited without calling "finalize".
By rule, all processes that call "init" MUST call "finalize" prior to
exiting or it will be considered an "abnormal termination"

3. this process called "MPI_Abort" or "orte_abort" and the mca parameter
orte_create_session_dirs is set to false. In this case, the run-time cannot
detect that the abort call was an abnormal termination. Hence, the only
error message you will receive is this one.

This may have caused other processes in the application to be
terminated by signals sent by mpirun (as reported here).

You can avoid this message by specifying -quiet on the mpirun command line.

--------------------------------------------------------------------------
[ FTI  Information ] : Reading FTI configuration file (/home/users/ksiero1/CoSP2/bin/config.fti)... 
[ FTI  Information ] : This is a restart. The execution ID is: 2017-10-16_12-02-59 
[ FTI  Information ] : Recovering successfully from level 1. 
[ FTI  Information ] : FTI has been initialized. 
CoSP2: SP2 Loop

Parameters:
msparse = 80  hDim = 98304  debug = 1
hmatName = 
eps = 1e-05  hEps = 1e-16
idemTol = 1e-14

hDim = 98304 M = 80
Adjusted M = 96
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 1 local row min = 6144  row max = 12288  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 3 local row min = 18432  row max = 24576  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 15 local row min = 92160  row max = 98304  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 4 local row min = 24576  row max = 30720  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 9 local row min = 55296  row max = 61440  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 13 local row min = 79872  row max = 86016  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 5 local row min = 30720  row max = 36864  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 11 local row min = 67584  row max = 73728  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 12 local row min = 73728  row max = 79872  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 8 local row min = 49152  row max = 55296  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 10 local row min = 61440  row max = 67584  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 6 local row min = 36864  row max = 43008  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 7 local row min = 43008  row max = 49152  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 14 local row min = 86016  row max = 92160  row extent = 6144
Generated H Matrix nnz = 672042 avg nnz/row = 6
total procs = 16  total rows = 98304  total cols = 96
global row min = 0  row max = 98304  row extent = 98304
rank = 0 local row min = 0  row max = 6144  row extent = 6144

Sparsity:
Initial sparsity = 672042, fraction = 6.258879e-04, Avg per row = 6.836365
Max per row = 7
I = 4, count = 2, fraction = 0.000020
I = 5, count = 621, fraction = 0.006317
I = 6, count = 14838, fraction = 0.150940
I = 7, count = 82843, fraction = 0.842723
Generated H Matrix nnz = 672042 avg nnz/row = 6
rank = 2 local row min = 12288  row max = 18432  row extent = 6144

Gershgorin:
New  eMax, eMin = 1.745500e+00, -7.356212e-01
bufferSize = 9437184
Initial sparsity normalized = 672042, fraction = 6.258879e-04,  avg = 6.83636, max = 7
[ FTI  Information ] : Variable ID 1 to protect. Current ckpt. size per rank is 0.00MB. 
[ FTI  Information ] : Variable ID 2 to protect. Current ckpt. size per rank is 0.38MB. 
[ FTI  Information ] : Variable ID 3 to protect. Current ckpt. size per rank is 36.38MB. 
[ FTI  Information ] : Variable ID 4 to protect. Current ckpt. size per rank is 108.38MB. 
[ FTI  Information ] : Variable ID 5 to protect. Current ckpt. size per rank is 108.75MB. 
[ FTI  Information ] : Variable ID 6 to protect. Current ckpt. size per rank is 144.75MB. 
[ FTI  Information ] : Variable ID 7 to protect. Current ckpt. size per rank is 216.75MB. 

SP2Loop:
iter = 10  trX = 4.981154e+04  trX2 = 4.464542e+04
iter = 11  trX = 4.464542e+04  trX2 = 4.032639e+04
iter = 12  trX = 4.896445e+04  trX2 = 4.554145e+04
iter = 13  trX = 5.238745e+04  trX2 = 4.956883e+04
iter = 14  trX = 4.956883e+04  trX2 = 4.731790e+04
iter = 15  trX = 4.731790e+04  trX2 = 4.544718e+04
iter = 16  trX = 4.918861e+04  trX2 = 4.771064e+04
iter = 17  trX = 4.771064e+04  trX2 = 4.649398e+04
iter = 18  trX = 4.892731e+04  trX2 = 4.795556e+04
iter = 19  trX = 4.989906e+04  trX2 = 4.910173e+04
[ FTI  Information ] : Post-checkpoint took 0.02 sec. (Pt:0.00s, Cl:0.02s) 
[ FTI  Information ] : Ckpt. ID 20 (L1) (216.75 MB/proc) taken in 1.18 sec. (Wt:0.00s, Wr:1.16s, Ps:0.02s) 
iter = 20  trX = 4.910173e+04  trX2 = 4.855031e+04
iter = 21  trX = 4.965316e+04  trX2 = 5.060054e+04
iter = 22  trX = 4.870578e+04  trX2 = -9.750371e+05
iter = 23  trX = 1.072449e+06  trX2 = -5.136388e+12
iter = 24  trX = -5.136388e+12  trX2 = 7.295617e+24

Results:
X2 Sparsity CCN = 2906510, fraction = 2.706898e-03 avg = 29.5665, max = 89
D Sparsity AAN = 2906464, fraction = 2.706856e-03 avg = 29.5661, max = 89
Number of iterations = 25


Counters for Rank 0
       Counter          Calls    Avg/Call(MB)         Total(MB)   
_________________________________________________________________
reduce                    19           0.0000            0.0003
send                      29           2.6508           76.8721
recv                      29           2.6315           76.3141

Counter Statistics Across 16 Ranks:
        Counter      Rank: Min(MB)        Rank: Max(MB)       Avg(MB)      Stdev(MB)
_______________________________________________________________________________________
reduce               0:      0.0003       0:      0.0003        0.0003        0.0000
send                15:     74.9711       7:    113.5838      106.5789       11.6620
recv                15:     75.9751       6:    112.0425      106.5789       11.5183


Timings for Rank 0
        Timer        # Calls    Avg/Call (s)   Total (s)    % Loop
___________________________________________________________________
total                      1       4.4862        4.4862      100.00
loop                       1       4.4862        4.4862      100.00
  pre                      1       0.5449        0.5449       12.15
  sp2Loop                  1       3.7464        3.7464       83.51
    norm                   1       0.0439        0.0439        0.98
    x2                    15       0.0423        0.6340       14.13
    xadd                   8       0.1030        0.8236       18.36
    xset                   7       0.0369        0.2582        5.76
    exchange              30       0.0033        0.0982        2.19
    reduceComm            19       0.0210        0.3999        8.91

Timing Statistics Across 16 Ranks:
        Timer        Rank: Min(s)       Rank: Max(s)      Avg(s)    Stdev(s)
_____________________________________________________________________________
total                1:    4.4668      10:    4.5643      4.5171      0.0276
loop                 1:    4.4668      10:    4.5643      4.5171      0.0276
  pre                1:    0.4197       2:    0.5793      0.5138      0.0399
  sp2Loop            9:    3.7438       8:    3.7513      3.7490      0.0019
    norm             1:    0.0081       6:    0.0463      0.0350      0.0112
    x2               3:    0.1789      13:    0.7067      0.5970      0.1598
    xadd             1:    0.0360      10:    0.8244      0.6514      0.2632
    xset             3:    0.0240       7:    0.2859      0.2339      0.0798
    exchange         0:    0.0982       3:    1.1791      0.4241      0.3105
    reduceComm       7:    0.1875       1:    1.2948      0.4136      0.3341
[ FTI  Information ] : FTI has been finalized. 
-------------------------------------------------------------------------------

End of calculations [pon, 16 paź 2017, 12:03:18 CEST].

-------------------------------------------------------------------------------

LULESH

⬆️ Top

Livermore Unstructured Lagrangian Explicit Shock Hydrodynamics (LULESH)

https://codesign.llnl.gov/lulesh.php

File changes

To convert the C++ Domain object into a char buffer that FTI can protect, Boost serialization was used. Three files were modified to integrate FTI: lulesh.cc, lulesh.h and lulesh-comm.cc. The modifications to the first two files are shown below. The modifications to the third file consisted only of replacing MPI_COMM_WORLD with FTI_COMM_WORLD and are not listed here.

diff --git a/LULESH/lulesh.cc b/FTI_LULESH/lulesh.cc
index a141611..d5572f8 100644
--- a/LULESH/lulesh.cc
+++ b/FTI_LULESH/lulesh.cc
@@ -162,6 +162,22 @@ Additional BSD Notice
 
 #include "lulesh.h"
 
+//********************
+// Boost Serialization
+//********************
+#include <boost/archive/text_oarchive.hpp>
+#include <boost/archive/text_iarchive.hpp>
+
+#include <sstream>
+// --- File version ---
+#include <fstream>
+std::stringstream locDom_ser;
+
+//*************************
+// FTI Checkpoint - Restart
+//*************************
+#include <fti.h> 
+#define ITER_CKPT 500
 
 /*********************************/
 /* Data structure implementation */
@@ -213,7 +229,7 @@ void TimeIncrement(Domain& domain)
 #if USE_MPI      
       MPI_Allreduce(&gnewdt, &newdt, 1,
                     ((sizeof(Real_t) == 4) ? MPI_FLOAT : MPI_DOUBLE),
-                    MPI_MIN, MPI_COMM_WORLD) ;
+                    MPI_MIN, FTI_COMM_WORLD) ;
 #else
       newdt = gnewdt;
 #endif
@@ -1061,7 +1077,7 @@ void CalcHourglassControlForElems(Domain& domain,
       /* Do a check for negative volumes */
       if ( domain.v(i) <= Real_t(0.0) ) {
 #if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
+         MPI_Abort(FTI_COMM_WORLD, VolumeError) ;
 #else
          exit(VolumeError);
 #endif
@@ -1111,7 +1127,7 @@ void CalcVolumeForceForElems(Domain& domain)
       for ( Index_t k=0 ; k<numElem ; ++k ) {
          if (determ[k] <= Real_t(0.0)) {
 #if USE_MPI            
-            MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
+            MPI_Abort(FTI_COMM_WORLD, VolumeError) ;
 #else
             exit(VolumeError);
 #endif
@@ -1626,7 +1642,7 @@ void CalcLagrangeElements(Domain& domain, Real_t* vnew)
          if (vnew[k] <= Real_t(0.0))
         {
 #if USE_MPI           
-           MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
+           MPI_Abort(FTI_COMM_WORLD, VolumeError) ;
 #else
            exit(VolumeError);
 #endif
@@ -2030,7 +2046,7 @@ void CalcQForElems(Domain& domain, Real_t vnew[])
 
       if(idx >= 0) {
 #if USE_MPI         
-         MPI_Abort(MPI_COMM_WORLD, QStopError) ;
+         MPI_Abort(FTI_COMM_WORLD, QStopError) ;
 #else
          exit(QStopError);
 #endif
@@ -2399,7 +2415,7 @@ void ApplyMaterialPropertiesForElems(Domain& domain, Real_t vnew[])
           }
           if (vc <= 0.) {
 #if USE_MPI             
-             MPI_Abort(MPI_COMM_WORLD, VolumeError) ;
+             MPI_Abort(FTI_COMM_WORLD, VolumeError) ;
 #else
              exit(VolumeError);
 #endif
@@ -2683,6 +2699,19 @@ void LagrangeLeapFrog(Domain& domain)
 #endif   
 }
 
+//Serialization 
+void save (Domain *dom_saved){
+  boost::archive::text_oarchive oa(locDom_ser);
+  oa << dom_saved;
+}
+
+//Deserialization
+Domain* load (){
+  Domain *dom_loaded;
+  boost::archive::text_iarchive ia(locDom_ser);
+  ia >> dom_loaded;
+  return dom_loaded;
+}
 
 /******************************************/
 
@@ -2697,8 +2726,10 @@ int main(int argc, char *argv[])
    Domain_member fieldData ;
 
    MPI_Init(&argc, &argv) ;
-   MPI_Comm_size(MPI_COMM_WORLD, &numRanks) ;
-   MPI_Comm_rank(MPI_COMM_WORLD, &myRank) ;
+   char config_fti[] = "config.fti";
+   FTI_Init(config_fti, MPI_COMM_WORLD);
+   MPI_Comm_size(FTI_COMM_WORLD, &numRanks) ;
+   MPI_Comm_rank(FTI_COMM_WORLD, &myRank) ;
 #else
    numRanks = 1;
    myRank = 0;
@@ -2755,7 +2786,7 @@ int main(int argc, char *argv[])
    CommSBN(*locDom, 1, &fieldData) ;
 
    // End initialization
-   MPI_Barrier(MPI_COMM_WORLD);
+   MPI_Barrier(FTI_COMM_WORLD);
 #endif   
    
    // BEGIN timestep to solution */
@@ -2766,10 +2797,68 @@ int main(int argc, char *argv[])
    gettimeofday(&start, NULL) ;
 #endif
 //debug to see region sizes
-//   for(Int_t i = 0; i < locDom->numReg(); i++)
-//      std::cout << "region" << i + 1<< "size" << locDom->regElemSize(i) <<std::endl;
-   while((locDom->time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) {
+  // for(Int_t i = 0; i < locDom->numReg(); i++)
+  //    std::cout << "region" << i + 1<< "size" << locDom->regElemSize(i) <<std::endl;
+
+
+//---------------------------------------------------------------------------------------------------------------------//
+
+  //First serialization to get a buffer size  
+  save(locDom);
 
+  //Cast std::stringstream -> char*
+  int buffer_size = 0;
+  char* buffer_locDom_ser;
+  std::string tmp = locDom_ser.str();
+  buffer_size = tmp.size();
+  buffer_size += 1000000; //Add this to handle the dynamic change size of the buffer 
+  buffer_locDom_ser = new char [buffer_size];
+  strcpy(buffer_locDom_ser, tmp.c_str());
+
+  //Checkpoint informations
+  int id = 1;
+  int level = 1;
+  int res;
+
+  FTI_Protect(0, &id, 1, FTI_INTG);
+  FTI_Protect(1, &level, 1, FTI_INTG);
+  FTI_Protect(2, buffer_locDom_ser, buffer_size, FTI_CHAR);
+
+
+  //Restart
+  if(FTI_Status() != 0){
+    if(!myRank)
+      std::cout << "---- Restart ----\n";
+
+    res = FTI_Recover();
+
+    //Update checkpoint information
+    if (res != 0) {
+        exit(1);
+    }
+    else { // Update ckpt. id & level
+        level = (level+1)%5; 
+        id++;
+    }
+
+    //Cast char* to stringstream
+    locDom_ser.str(""); //reset the stringstream
+    locDom_ser.str(buffer_locDom_ser);
+
+    //Deserialization
+    Domain *tmp;
+    tmp = load();
+
+    //Set the used by simulation object
+    delete locDom;
+    locDom = NULL;
+    locDom = tmp;
+  }
+
+//---------------------------------------------------------------------------------------------------------------------//
+   if (!myRank)
+     std::cout << "-- Start of the main loop --\n";
+   while((locDom->time() < locDom->stoptime()) && (locDom->cycle() < opts.its)) {
       TimeIncrement(*locDom) ;
       LagrangeLeapFrog(*locDom) ;
 
@@ -2777,6 +2866,26 @@ int main(int argc, char *argv[])
          printf("cycle = %d, time = %e, dt=%e\n",
                 locDom->cycle(), double(locDom->time()), double(locDom->deltatime()) ) ;
       }
+
+      //Checkpoint at ITER_CKPT
+      if((locDom->cycle()%ITER_CKPT) == 0 && locDom->cycle() != opts.its){
+
+        //Serialization of locDom in std::stringstream
+        locDom_ser.str("");
+        save(locDom);
+
+        //Cast std::stringstream -> char*
+        std::string tmp = locDom_ser.str();
+        buffer_locDom_ser[0] = '\0'; //reset the buffer
+        strcpy(buffer_locDom_ser, tmp.c_str());
+
+        res = FTI_Checkpoint(id, level);
+        // sleep(3); //for the tests
+        if(res != 0){
+          id++;
+          level= (level%4)+1;
+        }
+      }
    }
 
    // Use reduced max elapsed time
@@ -2791,7 +2900,7 @@ int main(int argc, char *argv[])
    double elapsed_timeG;
 #if USE_MPI   
    MPI_Reduce(&elapsed_time, &elapsed_timeG, 1, MPI_DOUBLE,
-              MPI_MAX, 0, MPI_COMM_WORLD);
+              MPI_MAX, 0, FTI_COMM_WORLD);
 #else
    elapsed_timeG = elapsed_time;
 #endif
@@ -2806,6 +2915,7 @@ int main(int argc, char *argv[])
    }
 
 #if USE_MPI
+   FTI_Finalize();
    MPI_Finalize() ;
 #endif
diff --git a/LULESH/lulesh.h b/FTI_LULESH/lulesh.h
index b6afd5c..1ca6a59 100644
--- a/LULESH/lulesh.h
+++ b/FTI_LULESH/lulesh.h
@@ -24,6 +24,16 @@
 #include <math.h>
 #include <vector>
 
+//********************
+// Boost Serialization
+//********************
+#include <boost/serialization/vector.hpp>
+#include <iostream>
+#include <fstream>
+#if _OPENMP
+#include <omp.h>
+#endif
+
 //**************************************************
 // Allow flexibility for arithmetic representations 
 //**************************************************
@@ -133,6 +143,27 @@ class Domain {
           Index_t rowLoc, Index_t planeLoc,
           Index_t nx, Int_t tp, Int_t nr, Int_t balance, Int_t cost);
 
+   Domain ()   :
+   m_e_cut(Real_t(1.0e-7)),
+   m_p_cut(Real_t(1.0e-7)),
+   m_q_cut(Real_t(1.0e-7)),
+   m_v_cut(Real_t(1.0e-10)),
+   m_u_cut(Real_t(1.0e-7)),
+   m_hgcoef(Real_t(3.0)),
+   m_ss4o3(Real_t(4.0)/Real_t(3.0)),
+   m_qstop(Real_t(1.0e+12)),
+   m_monoq_max_slope(Real_t(1.0)),
+   m_monoq_limiter_mult(Real_t(2.0)),
+   m_qlc_monoq(Real_t(0.5)),
+   m_qqc_monoq(Real_t(2.0)/Real_t(3.0)),
+   m_qqc(Real_t(2.0)),
+   m_eosvmax(Real_t(1.0e+9)),
+   m_eosvmin(Real_t(1.0e-9)),
+   m_pmin(Real_t(0.)),
+   m_emin(Real_t(-1.0e+15)),
+   m_dvovmax(Real_t(0.1)),
+   m_refdens(Real_t(1.0)) {};
+
    //
    // ALLOCATION
    //
@@ -423,6 +454,243 @@ class Domain {
    void SetupElementConnectivities(Int_t edgeElems);
    void SetupBoundaryConditions(Int_t edgeElems);
 
+   friend class boost::serialization::access;
+   template <typename Archive>
+   void serialize(Archive &ar, const unsigned int version){
+
+
+      //Check de/serialization
+      // if(Archive::is_loading::value){
+      //    std::cout << "-------------------------\n";
+      //    std::cout << "Start of deserialization.\n";
+      //    std::cout << "-------------------------\n";
+      // }
+      // else {
+      //    std::cout << "-------------------------\n";
+      //    std::cout << "Start of serialization.\n";
+      //    std::cout << "-------------------------\n";
+      // }
+
+      ar & m_x ;  /* coordinates */
+      ar & m_y;
+      ar & m_z;
+
+      ar & m_xd ; /* velocities */
+      ar & m_yd ;
+      ar & m_zd ;
+
+      ar & m_xdd ; /* accelerations */
+      ar & m_ydd ;
+      ar & m_zdd ;
+
+      ar & m_fx ;  /* forces */
+      ar & m_fy ;
+      ar & m_fz ;
+
+      ar & m_nodalMass ;  /* mass */
+
+      ar & m_symmX ;  /* symmetry plane nodesets */
+      ar & m_symmY ;
+      ar & m_symmZ ;
+
+      // Element-centered
+
+      ar & m_numRanks ;
+      ar & m_colLoc ;
+      ar & m_rowLoc ;
+      ar & m_planeLoc ;
+      ar & m_tp ;
+
+      ar & m_sizeX ;
+      ar & m_sizeY ;
+      ar & m_sizeZ ;
+      ar & m_numElem ;
+      ar & m_numNode ;
+
+      ar & m_maxPlaneSize ;
+      ar & m_maxEdgeSize ;
+
+      // Region information
+      ar &  m_numReg ;
+      ar &  m_cost; //imbalance cost
+      
+      if(Archive::is_loading::value){
+         m_regElemSize = new Index_t[m_numReg];
+      }
+      ar & boost::serialization::make_array <Index_t> (m_regElemSize, m_numReg); // Size of region sets
+
+      if(Archive::is_loading::value){
+         m_regNumList = new Index_t[m_numElem];
+      }
+      ar & boost::serialization::make_array <Index_t> (m_regNumList, m_numElem); // Region number per domain element
+
+      if(Archive::is_loading::value){
+         m_regElemlist = new Index_t*[m_numReg];
+         for (int i = 0; i < m_numReg; i++){
+            m_regElemlist[i] = new Index_t[m_regElemSize[i]];
+         }
+      }
+
+      for (int i = 0; i < m_numReg; i++){
+         ar & boost::serialization::make_array <Index_t> (m_regElemlist[i], m_regElemSize[i]);
+      }
+
+      ar &  m_nodelist ;     /* elemToNode connectivity */
+
+      ar & m_lxim ;  /* element connectivity across each face */
+      ar & m_lxip ;
+      ar & m_letam ;
+      ar & m_letap ;
+      ar & m_lzetam ;
+      ar & m_lzetap ;
+
+      ar & m_elemBC ;  /* symmetry/free-surface flags for each elem face */
+
+      ar & m_dxx ;  /* principal strains -- temporary */
+      ar & m_dyy ;
+      ar & m_dzz ;
+
+      ar & m_delv_xi ;    /* velocity gradient -- temporary */
+      ar & m_delv_eta ;
+      ar & m_delv_zeta ;
+
+      ar & m_delx_xi ;    /* coordinate gradient -- temporary */
+      ar & m_delx_eta ;
+      ar & m_delx_zeta ;
+
+      ar & m_e ;   /* energy */
+
+      ar & m_p ;   /* pressure */
+      ar & m_q ;   /* q */
+      ar & m_ql ;  /* linear term for q */
+      ar & m_qq ;  /* quadratic term for q */
+
+      ar & m_v ;     /* relative volume */
+      ar & m_volo ;  /* reference volume */
+      ar & m_vnew ;  /* new relative volume -- temporary */
+      ar & m_delv ;  /* m_vnew - m_v */
+      ar & m_vdov ;  /* volume derivative over volume */
+
+      ar & m_arealg ;  /* characteristic length of an element */
+
+      ar & m_ss ;      /* "sound speed" */
+
+      ar & m_elemMass ;  /* mass */
+
+      // Cutoffs (treat as constants)
+      ar & const_cast<Real_t &>(m_e_cut);
+      ar & const_cast<Real_t &>(m_p_cut);
+      ar & const_cast<Real_t &>(m_q_cut);
+      ar & const_cast<Real_t &>(m_v_cut);
+      ar & const_cast<Real_t &>(m_u_cut);
+
+      // Other constants (usually setable, but hardcoded in this proxy app)
+      ar & const_cast<Real_t &>(m_hgcoef);
+      ar & const_cast<Real_t &>(m_ss4o3);
+      ar & const_cast<Real_t &>(m_qstop);
+      ar & const_cast<Real_t &>(m_monoq_max_slope);
+      ar & const_cast<Real_t &>(m_monoq_limiter_mult);  
+      ar & const_cast<Real_t &>(m_qlc_monoq);
+      ar & const_cast<Real_t &>(m_qqc_monoq);
+      ar & const_cast<Real_t &>(m_qqc);
+      ar & const_cast<Real_t &>(m_eosvmax);
+      ar & const_cast<Real_t &>(m_eosvmin);
+      ar & const_cast<Real_t &>(m_pmin);
+      ar & const_cast<Real_t &>(m_emin);
+      ar & const_cast<Real_t &>(m_dvovmax);
+      ar & const_cast<Real_t &>(m_refdens);
+
+      // Variables to keep track of timestep, simulation time, and cycle
+      ar &  m_dtcourant ;         // courant constraint 
+      ar &  m_dthydro ;           // volume change constraint 
+      ar &   m_cycle ;             // iteration count for simulation 
+      ar &  m_dtfixed ;           // fixed time increment 
+      ar &  m_time ;              // current time 
+      ar &  m_deltatime ;         // variable time increment 
+      ar &  m_deltatimemultlb ;
+      ar &  m_deltatimemultub ;
+      ar &  m_dtmax ;             // maximum allowable time increment 
+      ar &  m_stoptime ;          // end time for simulation 
+
+      // OMP hack 
+      #if _OPENMP
+         Index_t numthreads = omp_get_max_threads();
+      #else
+         Index_t numthreads = 1;
+      #endif
+
+      if (numthreads > 1) {
+         if(Archive::is_loading::value){
+            m_nodeElemStart = new Index_t[m_numNode+1];
+         }
+         ar & boost::serialization::make_array <Index_t> (m_nodeElemStart, m_numNode+1);
+
+         if(Archive::is_loading::value){
+            m_nodeElemCornerList = new Index_t[m_nodeElemStart[m_numNode]];
+         }
+         ar & boost::serialization::make_array <Index_t> (m_nodeElemCornerList, m_nodeElemStart[m_numNode]);
+      } else {
+         m_nodeElemStart = NULL;
+         m_nodeElemCornerList = NULL;
+      }
+
+      // Used in setup
+      ar & m_rowMin;
+      ar & m_rowMax;
+      ar & m_colMin;
+      ar & m_colMax;
+      ar & m_planeMin;
+      ar & m_planeMax; 
+
+     #if USE_MPI   
+     // account for face communication 
+     Index_t comBufSize =
+       (m_rowMin + m_rowMax + m_colMin + m_colMax + m_planeMin + m_planeMax) *
+       m_maxPlaneSize * MAX_FIELDS_PER_MPI_COMM ;
+
+     // account for edge communication 
+     comBufSize +=
+       ((m_rowMin & m_colMin) + (m_rowMin & m_planeMin) + (m_colMin & m_planeMin) +
+        (m_rowMax & m_colMax) + (m_rowMax & m_planeMax) + (m_colMax & m_planeMax) +
+        (m_rowMax & m_colMin) + (m_rowMin & m_planeMax) + (m_colMin & m_planeMax) +
+        (m_rowMin & m_colMax) + (m_rowMax & m_planeMin) + (m_colMax & m_planeMin)) *
+       m_maxEdgeSize * MAX_FIELDS_PER_MPI_COMM ;
+
+     // account for corner communication 
+     // factor of 16 is so each buffer has its own cache line 
+     comBufSize += ((m_rowMin & m_colMin & m_planeMin) +
+          (m_rowMin & m_colMin & m_planeMax) +
+          (m_rowMin & m_colMax & m_planeMin) +
+          (m_rowMin & m_colMax & m_planeMax) +
+          (m_rowMax & m_colMin & m_planeMin) +
+          (m_rowMax & m_colMin & m_planeMax) +
+          (m_rowMax & m_colMax & m_planeMin) +
+          (m_rowMax & m_colMax & m_planeMax)) * CACHE_COHERENCE_PAD_REAL ;
+
+
+      // Communication Work space 
+      if(Archive::is_loading::value){
+         commDataSend = new Real_t[comBufSize];
+         commDataRecv = new Real_t[comBufSize];
+      }
+      ar & boost::serialization::make_array <Real_t> (commDataRecv,comBufSize);
+      ar & boost::serialization::make_array <Real_t> (commDataSend,comBufSize);
+
+      #endif
+
+      //Check de/serialization
+      // if(Archive::is_loading::value){
+      //    std::cout << "-------------------------\n";
+      //    std::cout << "Deserialization finished.\n";
+      //    std::cout << "-------------------------\n";
+      // }
+      // else {
+      //    std::cout << "-------------------------\n";
+      //    std::cout << "Serialization finished.\n";
+      //    std::cout << "-------------------------\n";
+      // }
+   }
+
    //
    // IMPLEMENTATION
    //