From f67588fcfdaa3ce7788310c1b6993a6442c46543 Mon Sep 17 00:00:00 2001
From: Kengo Tomida <tomida@astr.tohoku.ac.jp>
Date: Sat, 31 Aug 2024 23:09:29 +0900
Subject: [PATCH 1/9] First implementation

---
 src/athena.hpp               |  20 ++
 src/mesh/amr_loadbalance.cpp | 350 +++++++++++++++++++++++++++++------
 src/mesh/mesh.hpp            |  15 +-
 src/mesh/mesh_refinement.cpp |  54 ++++++
 src/mesh/mesh_refinement.hpp |  21 +++
 src/multigrid/multigrid.hpp  |  18 --
 6 files changed, 405 insertions(+), 73 deletions(-)

diff --git a/src/athena.hpp b/src/athena.hpp
index 90ca38a27f..bed1600b52 100644
--- a/src/athena.hpp
+++ b/src/athena.hpp
@@ -78,6 +78,26 @@ struct LogicalLocation { // aggregate and POD type
 //! prototype for overloading the comparison operator (defined in meshblock_tree.cpp)
 bool operator==(const LogicalLocation &l1, const LogicalLocation &l2);
 
+
+//! \fn inline std::int64_t rotl(std::int64_t i, int s)
+//  \brief left bit rotation function for 64bit integers (unsafe if s > 64)
+
+inline std::int64_t rotl(std::int64_t i, int s) {
+  return (i << s) | (i >> (64 - s));
+}
+
+
+//! \struct LogicalLocationHash
+//  \brief Hash function object for LogicalLocation
+
+struct LogicalLocationHash {
+ public:
+  std::size_t operator()(const LogicalLocation &l) const {
+    return static_cast<std::size_t>(l.lx1^rotl(l.lx2,21)^rotl(l.lx3,42));
+  }
+};
+
+
 //----------------------------------------------------------------------------------------
 //! \struct RegionSize
 //! \brief physical size and number of cells in a Mesh or a MeshBlock
diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 19364164e6..5f3440b782 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -378,6 +378,8 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
     int on = newtoold[n];
     if (newloc[n].level>current_level) // set the current max level
       current_level = newloc[n].level;
+    if (newloc[n].level > loclist[on].level) // refined
+      refined_.push_back(n);
     if (newloc[n].level >= loclist[on].level) { // same or refined
       newcost[n] = costlist[on];
     } else {
@@ -394,9 +396,9 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
   int nbs = nslist[Globals::my_rank];
   int nbe = nbs + nblist[Globals::my_rank] - 1;
 
-  int bnx1 = my_blocks(0)->block_size.nx1;
-  int bnx2 = my_blocks(0)->block_size.nx2;
-  int bnx3 = my_blocks(0)->block_size.nx3;
+  const int bnx1 = my_blocks(0)->block_size.nx1;
+  const int bnx2 = my_blocks(0)->block_size.nx2;
+  const int bnx3 = my_blocks(0)->block_size.nx3;
 
 #ifdef MPI_PARALLEL
   // Step 3. count the number of the blocks to be sent / received
@@ -545,6 +547,9 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
   } // if (nsend !=0)
 #endif // MPI_PARALLEL
 
+  if (my_blocks(0)->pmr->pvars_fc_.size() > 0)
+    PrepareAndSendFaceFieldCorrection(newloc, ranklist, newrank, nslist, nbtold);
+
   // Step 7. construct a new MeshBlock list (moving the data within the MPI rank)
   AthenaArray<MeshBlock*> newlist;
   newlist.NewAthenaArray(nblist[Globals::my_rank]);
@@ -620,19 +625,36 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
       } else { // c2f
         if (ranklist[on] == Globals::my_rank) continue;
         MPI_Wait(&(req_recv[rb_idx]), MPI_STATUS_IGNORE);
-        FinishRecvCoarseToFineAMR(pb, recvbuf[rb_idx]);
+        ReceiveCoarseToFineAMR(pb, recvbuf[rb_idx]);
         rb_idx++;
       }
     }
   }
 #endif
 
+  if (my_blocks(0)->pmr->pvars_fc_.size() > 0)
+    ReceiveAndSetFaceFieldCorrection(newrank);
+
+  // Step 8b. Prolongate MeshBlocks
+  for (int n=nbs; n<=nbe; n++) {
+    int on = newtoold[n];
+    LogicalLocation const &oloc = loclist[on];
+    LogicalLocation const &nloc = newloc[n];
+    if (oloc.level < nloc.level) { // c2f
+      MeshBlock *pb = FindMeshBlock(n);
+      ProlongateMeshBlock(pb);
+    }
+  }
+
   // deallocate arrays
   delete [] loclist;
   delete [] ranklist;
   delete [] costlist;
   delete [] newtoold;
   delete [] oldtonew;
+  refined_.clear();
+  ffc_send_.clear();
+  ffc_recv_.clear();
 #ifdef MPI_PARALLEL
   if (nsend != 0) {
     MPI_Waitall(nsend, req_send, MPI_STATUSES_IGNORE);
@@ -949,8 +971,7 @@ void Mesh::FillSameRankFineToCoarseAMR(MeshBlock* pob, MeshBlock* pmb,
 //----------------------------------------------------------------------------------------
 //! \fn void Mesh::FillSameRankCoarseToFineAMR(MeshBlock* pob, MeshBlock* pmb,
 //!                                            LogicalLocation &newloc)
-//! \brief step 7: c2f, same MPI rank, different level
-//!        (just copy+prolongate, no pack/send)
+//! \brief step 7: c2f, same MPI rank, different level (just copy, no pack/send)
 
 void Mesh::FillSameRankCoarseToFineAMR(MeshBlock* pob, MeshBlock* pmb,
                                        LogicalLocation &newloc) {
@@ -979,9 +1000,6 @@ void Mesh::FillSameRankCoarseToFineAMR(MeshBlock* pob, MeshBlock* pmb,
         }
       }
     }
-    pmr->ProlongateCellCenteredValues(
-        dst, *var_cc, 0, nu,
-        pob->cis, pob->cie, pob->cjs, pob->cje, pob->cks, pob->cke);
     pob_cc_it++;
   }
 
@@ -1003,9 +1021,6 @@ void Mesh::FillSameRankCoarseToFineAMR(MeshBlock* pob, MeshBlock* pmb,
         }
       }
     }
-    pmr->ProlongateCellCenteredValues(
-        dst, var_cc, -1, 0, nu,
-        pob->cis, pob->cie, pob->cjs, pob->cje, pob->cks, pob->cke);
   }
 
   auto pob_fc_it = pob->pmr->pvars_fc_.begin();
@@ -1034,18 +1049,6 @@ void Mesh::FillSameRankCoarseToFineAMR(MeshBlock* pob, MeshBlock* pmb,
           dst_b.x3f(k, j, i) = src_b.x3f(ck, cj, ci);
       }
     }
-    pmr->ProlongateSharedFieldX1(
-        dst_b.x1f, (*var_fc).x1f,
-        pob->cis, pob->cie+1, pob->cjs, pob->cje, pob->cks, pob->cke);
-    pmr->ProlongateSharedFieldX2(
-        dst_b.x2f, (*var_fc).x2f,
-        pob->cis, pob->cie, pob->cjs, pob->cje+f2, pob->cks, pob->cke);
-    pmr->ProlongateSharedFieldX3(
-        dst_b.x3f, (*var_fc).x3f,
-        pob->cis, pob->cie, pob->cjs, pob->cje, pob->cks, pob->cke+f3);
-    pmr->ProlongateInternalField(
-        *var_fc, pob->cis, pob->cie,
-        pob->cjs, pob->cje, pob->cks, pob->cke);
     pob_fc_it++;
   }
   return;
@@ -1055,6 +1058,7 @@ void Mesh::FillSameRankCoarseToFineAMR(MeshBlock* pob, MeshBlock* pmb,
 //! \fn void Mesh::FinishRecvSameLevel(MeshBlock *pb, Real *recvbuf)
 //! \brief step 8 (receive and load), branch 1 (same2same: unpack)
 void Mesh::FinishRecvSameLevel(MeshBlock *pb, Real *recvbuf) {
+  MeshRefinement *pmr = pb->pmr;
   int p = 0;
   for (AthenaArray<Real> &var_cc : pb->vars_cc_) {
     int nu = var_cc.GetDim4() - 1;
@@ -1091,7 +1095,7 @@ void Mesh::FinishRecvSameLevel(MeshBlock *pb, Real *recvbuf) {
   //!   appended integer from received buffer is slightly unsafe
   if (adaptive) {
     int *dcp = reinterpret_cast<int *>(&(recvbuf[p]));
-    pb->pmr->deref_count_ = *dcp;
+    pmr->deref_count_ = *dcp;
   }
   return;
 }
@@ -1103,6 +1107,7 @@ void Mesh::FinishRecvSameLevel(MeshBlock *pb, Real *recvbuf) {
 
 void Mesh::FinishRecvFineToCoarseAMR(MeshBlock *pb, Real *recvbuf,
                                      LogicalLocation &lloc) {
+  MeshRefinement *pmr = pb->pmr;
   int ox1 = ((lloc.lx1 & 1LL) == 1LL), ox2 = ((lloc.lx2 & 1LL) == 1LL),
       ox3 = ((lloc.lx3 & 1LL) == 1LL);
   int p = 0, il, iu, jl, ju, kl, ku;
@@ -1113,7 +1118,7 @@ void Mesh::FinishRecvFineToCoarseAMR(MeshBlock *pb, Real *recvbuf,
   if (ox3 == 0) kl = pb->ks,            ku = pb->ks + pb->block_size.nx3/2 - f3;
   else        kl = pb->ks + pb->block_size.nx3/2, ku = pb->ke;
 
-  for (auto cc_pair : pb->pmr->pvars_cc_) {
+  for (auto cc_pair : pmr->pvars_cc_) {
     AthenaArray<Real> *var_cc = std::get<0>(cc_pair);
     int nu = var_cc->GetDim4() - 1;
     BufferUtility::UnpackData(recvbuf, *var_cc, 0, nu,
@@ -1126,7 +1131,7 @@ void Mesh::FinishRecvFineToCoarseAMR(MeshBlock *pb, Real *recvbuf,
                               il, iu, jl, ju, p);
   }
 
-  for (auto fc_pair : pb->pmr->pvars_fc_) {
+  for (auto fc_pair : pmr->pvars_fc_) {
     FaceField *var_fc = std::get<0>(fc_pair);
     FaceField &dst_b = *var_fc;
     BufferUtility::UnpackData(recvbuf, dst_b.x1f,
@@ -1150,41 +1155,31 @@ void Mesh::FinishRecvFineToCoarseAMR(MeshBlock *pb, Real *recvbuf,
 }
 
 //----------------------------------------------------------------------------------------
-//! \fn void Mesh::FinishRecvCoarseToFineAMR(MeshBlock *pb, Real *recvbuf)
-//! \brief step 8 (receive and load), branch 2 (c2f: unpack+prolongate)
+//! \fn void Mesh::ReceiveCoarseToFineAMR(MeshBlock *pb, Real *recvbuf)
+//! \brief step 8, branch 2 (c2f: receive and unpack)
 
-void Mesh::FinishRecvCoarseToFineAMR(MeshBlock *pb, Real *recvbuf) {
+void Mesh::ReceiveCoarseToFineAMR(MeshBlock *pb, Real *recvbuf) {
   MeshRefinement *pmr = pb->pmr;
   int p = 0;
   int il = pb->cis - 1, iu = pb->cie+1, jl = pb->cjs - f2,
       ju = pb->cje + f2, kl = pb->cks - f3, ku = pb->cke + f3;
-  for (auto cc_pair : pb->pmr->pvars_cc_) {
+  for (auto cc_pair : pmr->pvars_cc_) {
     AthenaArray<Real> *var_cc = std::get<0>(cc_pair);
     AthenaArray<Real> *coarse_cc = std::get<1>(cc_pair);
     int nu = var_cc->GetDim4() - 1;
     BufferUtility::UnpackData(recvbuf, *coarse_cc,
                               0, nu, il, iu, jl, ju, kl, ku, p);
-    pmr->ProlongateCellCenteredValues(
-        *coarse_cc, *var_cc, 0, nu,
-        pb->cis, pb->cie, pb->cjs, pb->cje, pb->cks, pb->cke);
   }
 
-
   if ((NR_RADIATION_ENABLED|| IM_RADIATION_ENABLED)) {
-    // copy from pmb block
     AthenaArray<Real> &var_cc = pb->pnrrad->ir;
     AthenaArray<Real> &coarse_cc = pb->pnrrad->coarse_ir_;
     int nu = var_cc.GetDim1() - 1;
-
     BufferUtility::UnpackData(recvbuf, coarse_cc, kl, ku,
                               0, nu, il, iu, jl, ju, p);
-
-    pmr->ProlongateCellCenteredValues(
-        coarse_cc, var_cc, -1, 0, nu,
-        pb->cis, pb->cie, pb->cjs, pb->cje, pb->cks, pb->cke);
   }
 
-  for (auto fc_pair : pb->pmr->pvars_fc_) {
+  for (auto fc_pair : pmr->pvars_fc_) {
     FaceField *var_fc = std::get<0>(fc_pair);
     FaceField *coarse_fc = std::get<1>(fc_pair);
 
@@ -1194,22 +1189,253 @@ void Mesh::FinishRecvCoarseToFineAMR(MeshBlock *pb, Real *recvbuf) {
                               il, iu, jl, ju+f2, kl, ku, p);
     BufferUtility::UnpackData(recvbuf, (*coarse_fc).x3f,
                               il, iu, jl, ju, kl, ku+f3, p);
-    pmr->ProlongateSharedFieldX1(
-        (*coarse_fc).x1f, (*var_fc).x1f,
-        pb->cis, pb->cie+1, pb->cjs, pb->cje, pb->cks, pb->cke);
-    pmr->ProlongateSharedFieldX2(
-        (*coarse_fc).x2f, (*var_fc).x2f,
-        pb->cis, pb->cie, pb->cjs, pb->cje+f2, pb->cks, pb->cke);
-    pmr->ProlongateSharedFieldX3(
-        (*coarse_fc).x3f, (*var_fc).x3f,
-        pb->cis, pb->cie, pb->cjs, pb->cje, pb->cks, pb->cke+f3);
-    pmr->ProlongateInternalField(
-        *var_fc, pb->cis, pb->cie,
-        pb->cjs, pb->cje, pb->cks, pb->cke);
   }
   return;
 }
 
+//----------------------------------------------------------------------------------------
+//! \fn void Mesh::ProlongateMeshBlock(MeshBlock *pb)
+//! \brief step 8, branch 2 (c2f: prolongate)
+
+void Mesh::ProlongateMeshBlock(MeshBlock *pb) {
+  MeshRefinement *pmr = pb->pmr;
+  for (auto cc_pair : pmr->pvars_cc_) {
+    AthenaArray<Real> *var_cc = std::get<0>(cc_pair);
+    AthenaArray<Real> *coarse_cc = std::get<1>(cc_pair);
+    int nu = var_cc->GetDim4() - 1;
+    pmr->ProlongateCellCenteredValues(*coarse_cc, *var_cc, 0, nu,
+                   pb->cis, pb->cie, pb->cjs, pb->cje, pb->cks, pb->cke);
+  }
+
+
+  if ((NR_RADIATION_ENABLED|| IM_RADIATION_ENABLED)) {
+    // copy from pmb block
+    AthenaArray<Real> &var_cc = pb->pnrrad->ir;
+    AthenaArray<Real> &coarse_cc = pb->pnrrad->coarse_ir_;
+    int nu = var_cc.GetDim1() - 1;
+
+    pmr->ProlongateCellCenteredValues(coarse_cc, var_cc, -1, 0, nu,
+                   pb->cis, pb->cie, pb->cjs, pb->cje, pb->cks, pb->cke);
+  }
+
+  int il = pb->cis, iu = pb->cie+1, jl = pb->cjs, ju = pb->cje + f2,
+      kl = pb->cks, ku = pb->cke + f3;
+  // Step FFC6. skip the surface fields contacting previously refined cells
+  if (pmr->flag_ffc_recv_[BoundaryFace::inner_x1]) il++;
+  if (pmr->flag_ffc_recv_[BoundaryFace::outer_x1]) iu--;
+  if (pmr->flag_ffc_recv_[BoundaryFace::inner_x2]) jl++;
+  if (pmr->flag_ffc_recv_[BoundaryFace::outer_x2]) ju--;
+  if (pmr->flag_ffc_recv_[BoundaryFace::inner_x3]) kl++;
+  if (pmr->flag_ffc_recv_[BoundaryFace::outer_x3]) ku--;
+
+  for (auto fc_pair : pmr->pvars_fc_) {
+    FaceField *var_fc = std::get<0>(fc_pair);
+    FaceField *coarse_fc = std::get<1>(fc_pair);
+
+    pmr->ProlongateSharedFieldX1((*coarse_fc).x1f, (*var_fc).x1f,
+                                 il, iu, pb->cjs, pb->cje, pb->cks, pb->cke);
+    pmr->ProlongateSharedFieldX2((*coarse_fc).x2f, (*var_fc).x2f,
+                                 pb->cis, pb->cie, jl, ju, pb->cks, pb->cke);
+    pmr->ProlongateSharedFieldX3((*coarse_fc).x3f, (*var_fc).x3f,
+                                 pb->cis, pb->cie, pb->cjs, pb->cje, kl, ku);
+    pmr->ProlongateInternalField(*var_fc, pb->cis, pb->cie,
+                                 pb->cjs, pb->cje, pb->cks, pb->cke);
+  }
+  return;
+}
+
+
+//----------------------------------------------------------------------------------------
+//! \fn void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
+//!                              int *ranklist, int *newrank, int *nslist, int nbtold)
+//! \brief the first part of the face field correction
+
+void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
+                                  int *ranklist, int *newrank, int *nslist, int nbtold) {
+  const int bnx1 = my_blocks(0)->block_size.nx1;
+  const int bnx2 = my_blocks(0)->block_size.nx2;
+  const int bnx3 = my_blocks(0)->block_size.nx3;
+  const int nf = my_blocks(0)->pmr->pvars_fc_.size();
+  const int sffc1 = nf*bnx2*bnx3, sffc2 = nf*bnx1*bnx3, sffc3 = nf*bnx2*bnx3;
+  // Step FFC1. Find pairs that need face field correction
+  int s = 0;
+  for (int n : refined_) { // loop over refined MeshBlocks
+    LogicalLocation const &nloc = newloc[n];
+    int xf1 = static_cast<int>(nloc.lx1 & 1LL);
+    int xf2 = static_cast<int>(nloc.lx2 & 1LL);
+    int xf3 = static_cast<int>(nloc.lx3 & 1LL);
+    std::int64_t o1, o2, o3;
+    o1 = (xf1 << 1) - 1;
+    o2 = (xf2 << 1) - 1;
+    o3 = (xf3 << 1) - 1;
+    // *** KT: This loop over all the current MeshBlcks is a naive implementation.
+    // The MeshBlockTree is already updated at this point, so we have to search loclist.
+    // Perhaps we should use std::unordered_map if this part costs too much.
+    // We already have such an example in MultigridDriver (octet map).
+    for (int i = 0; i < nbtold; ++i) {
+      LogicalLocation const &floc = loclist[i];
+      if (nloc.level == floc.level) {
+        if (nloc.lx1 + o1 == floc.lx1 && nloc.lx2 == floc.lx2 && nloc.lx3 == floc.lx3) {
+          if (ranklist[i] == Globals::my_rank) {
+            ffc_send_.emplace_back(i, n, xf1^1, sffc1, -1);
+            if (newrank[n] == Globals::my_rank)
+              ffc_recv_.emplace_back(i, n, xf1, 0, s);
+            s++;
+          } else if (newrank[n] == Globals::my_rank) {
+            ffc_recv_.emplace_back(i, n, xf1, sffc1, -1);
+          }
+        }
+        if (f2) {
+          if (nloc.lx1 == floc.lx1 && nloc.lx2 + o2 == floc.lx2 && nloc.lx3 == floc.lx3) {
+            if (ranklist[i] == Globals::my_rank) {
+              ffc_send_.emplace_back(i, n, xf2^1 + 2, sffc2, -1);
+              if (newrank[n] == Globals::my_rank)
+                ffc_recv_.emplace_back(i, n, xf2 + 2, 0, s);
+              s++;
+            } else if (newrank[n] == Globals::my_rank) {
+              ffc_recv_.emplace_back(i, n, xf2 + 2, sffc2, -1);
+            }
+          }
+        }
+        if (f3) {
+          if (nloc.lx1 == floc.lx1 && nloc.lx2 == floc.lx2 && nloc.lx3 + o3 == floc.lx3) {
+            if (ranklist[i] == Globals::my_rank) {
+              ffc_send_.emplace_back(i, n, xf3^1 + 4, sffc3, -1);
+              if (newrank[n] == Globals::my_rank)
+                ffc_recv_.emplace_back(i, n, xf3 + 4, 0, s);
+              s++;
+            } else if (newrank[n] == Globals::my_rank) {
+              ffc_recv_.emplace_back(i, n, xf3 + 4, sffc3, -1);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Step FFC2. Initiate MPI receive
+#ifdef MPI_PARALLEL
+  for (FaceFieldCorrection &t : ffc_recv_) {
+    if (ranklist[t.from] != Globals::my_rank) {
+      int tag = CreateFaceFieldCorrectionMPITag(t.to-nslist[newrank[t.to]], t.face);
+      MPI_Irecv(t.buf, t.size, MPI_ATHENA_REAL, ranklist[t.from],
+                tag, MPI_COMM_WORLD, &(t.req));
+    }
+  }
+#endif
+
+  // Step FFC3. Pack and send shared face fields
+  for (FaceFieldCorrection &f : ffc_send_) {
+    MeshBlock *pmb = FindMeshBlock(f.from);
+    int p = 0;
+    for (FaceField &var_fc : pmb->vars_fc_) {
+      switch (f.face) {
+        case BoundaryFace::inner_x1:
+          BufferUtility::PackData(var_fc.x1f, f.buf, pmb->is, pmb->is,
+                                  pmb->js, pmb->je, pmb->ks, pmb->ke, p);
+          break;
+        case BoundaryFace::outer_x1:
+          BufferUtility::PackData(var_fc.x1f, f.buf, pmb->ie+1, pmb->ie+1,
+                                  pmb->js, pmb->je, pmb->ks, pmb->ke, p);
+          break;
+        case BoundaryFace::inner_x2:
+          BufferUtility::PackData(var_fc.x2f, f.buf, pmb->is, pmb->ie,
+                                  pmb->js, pmb->js, pmb->ks, pmb->ke, p);
+          break;
+        case BoundaryFace::outer_x2:
+          BufferUtility::PackData(var_fc.x2f, f.buf, pmb->is, pmb->ie,
+                                  pmb->je+f2, pmb->je+f2, pmb->ks, pmb->ke, p);
+          break;
+        case BoundaryFace::inner_x3:
+          BufferUtility::PackData(var_fc.x3f, f.buf, pmb->is, pmb->ie,
+                                  pmb->js, pmb->je, pmb->ks, pmb->ks, p);
+          break;
+        case BoundaryFace::outer_x3:
+          BufferUtility::PackData(var_fc.x3f, f.buf, pmb->is, pmb->ie,
+                                  pmb->js, pmb->je, pmb->ke+f3, pmb->ke+f3, p);
+          break;
+        default:
+          break;
+      }
+    }
+#ifdef MPI_PARALLEL
+    if (newrank[f.to] != Globals::my_rank) {
+      int face = f.face^1;
+      int tag = CreateFaceFieldCorrectionMPITag(f.to-nslist[newrank[f.to]], face);
+      MPI_Isend(f.buf, f.size, MPI_ATHENA_REAL, newrank[f.to],
+                tag, MPI_COMM_WORLD, &(f.req));
+    }
+#endif
+  }
+}
+
+
+//----------------------------------------------------------------------------------------
+//! \fn void Mesh::ReceiveAndSetFaceFieldCorrection(int *newrank)
+//! \brief the second part of the face field correction
+
+void Mesh::ReceiveAndSetFaceFieldCorrection(int *newrank) {
+  // Step FFC4. clear face field correction flags
+  for (int n = 0; n < nblocal; ++n) {
+    MeshRefinement *pmr = my_blocks(n)->pmr;
+    for (int i = 0; i < 6; ++i)
+      pmr->flag_ffc_recv_[i] = false;
+  }
+
+  // Step FFC5. wait and unpack/set
+  for (FaceFieldCorrection &t : ffc_recv_) {
+    MeshBlock *pmb = FindMeshBlock(t.to);
+    pmb->pmr->flag_ffc_recv_[t.face] = true;
+    Real *buf = t.buf;
+    if (ranklist[t.from] == Globals::my_rank) // local copy
+      buf = ffc_send_[t.src].buf;
+#ifdef MPI_PARALLEL
+    else // MPI
+      MPI_Wait(&(t.req), MPI_STATUS_IGNORE);
+#endif
+    int p = 0;
+    for (FaceField &var_fc : pmb->vars_fc_) {
+      switch (t.face) {
+        case BoundaryFace::inner_x1:
+          BufferUtility::UnpackData(buf, var_fc.x1f, pmb->is, pmb->is,
+                                    pmb->js, pmb->je, pmb->ks, pmb->ke, p);
+          break;
+        case BoundaryFace::outer_x1:
+          BufferUtility::UnpackData(buf, var_fc.x1f, pmb->ie+1, pmb->ie+1,
+                                    pmb->js, pmb->je, pmb->ks, pmb->ke, p);
+          break;
+        case BoundaryFace::inner_x2:
+          BufferUtility::UnpackData(buf, var_fc.x2f, pmb->is, pmb->ie,
+                                    pmb->js, pmb->js, pmb->ks, pmb->ke, p);
+          break;
+        case BoundaryFace::outer_x2:
+          BufferUtility::UnpackData(buf, var_fc.x2f, pmb->is, pmb->ie,
+                                    pmb->je+f2, pmb->je+f2, pmb->ks, pmb->ke, p);
+          break;
+        case BoundaryFace::inner_x3:
+          BufferUtility::UnpackData(buf, var_fc.x3f, pmb->is, pmb->ie,
+                                    pmb->js, pmb->je, pmb->ks, pmb->ks, p);
+          break;
+        case BoundaryFace::outer_x3:
+          BufferUtility::UnpackData(buf, var_fc.x3f, pmb->is, pmb->ie,
+                                    pmb->js, pmb->je, pmb->ke+f3, pmb->ke+f3, p);
+          break;
+        default:
+          break;
+      }
+    }
+  }
+
+  // Step FFC6. Finalize MPI send
+#ifdef MPI_PARALLEL
+  for (FaceFieldCorrection &f : ffc_send_) {
+    if (newrank[f.to] != Globals::my_rank) {
+      MPI_Wait(&(f.req), MPI_STATUS_IGNORE);
+    }
+  }
+#endif
+}
+
 //----------------------------------------------------------------------------------------
 //! \fn int Mesh::CreateAMRMPITag(int lid, int ox1, int ox2, int ox3)
 //! \brief calculate an MPI tag for AMR block transfer
@@ -1223,3 +1449,19 @@ int Mesh::CreateAMRMPITag(int lid, int ox1, int ox2, int ox3) {
   // former "AthenaTagMPI" AthenaTagMPI::amr=8 redefined to 0
   return (lid<<8) | (ox1<<7)| (ox2<<6) | (ox3<<5);
 }
+
+//----------------------------------------------------------------------------------------
+//! \fn int Mesh::CreateFaceFieldCorrectionMPITag(int lid, int face)
+//! \brief calculate an MPI tag for face field correction
+//!
+//! tag = local id of destination (remaining bits) + ox1(1 bit) + ox2(1 bit) + ox3(1 bit)
+//!       + physics(5 bits)
+//!
+
+int Mesh::CreateFaceFieldCorrectionMPITag(int lid, int face) {
+  // Set the bottom bit = 1
+  // It should be OK as this communication does not overlap with te main integrator,
+  // and all we need is to distinguish FFC communications from AMR communications.
+  return (lid<<8) | (face << 5) | 1;
+}
+
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 812b702cee..427182057b 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -16,6 +16,7 @@
 // C++ headers
 #include <cstdint>     // int64_t
 #include <functional>  // reference_wrapper
+#include <unordered_map>
 #include <string>
 #include <vector>
 
@@ -341,6 +342,11 @@ class Mesh {
   double lb_tolerance_;
   int lb_interval_;
 
+  // for AMR face field correction
+  std::unordered_map<LogicalLocation, int, LogicalLocationHash> *locmap_;
+  std::vector<int> refined_;
+  std::vector<FaceFieldCorrection> ffc_send_, ffc_recv_;
+
   // functions
   MeshGenFunc MeshGenerator_[3];
   BValFunc BoundaryFunction_[6];
@@ -392,7 +398,14 @@ class Mesh {
   // step 8: receive
   void FinishRecvSameLevel(MeshBlock *pb, Real *recvbuf);
   void FinishRecvFineToCoarseAMR(MeshBlock *pb, Real *recvbuf, LogicalLocation &lloc);
-  void FinishRecvCoarseToFineAMR(MeshBlock *pb, Real *recvbuf);
+  void ReceiveCoarseToFineAMR(MeshBlock *pb, Real *recvbuf);
+  void ProlongateMeshBlock(MeshBlock *pb);
+
+  // Face field correction
+  void PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
+                              int *ranklist, int *newrank, int *nslist, int nbtold);
+  void ReceiveAndSetFaceFieldCorrection(int *newrank);
+  int CreateFaceFieldCorrectionMPITag(int lid, int face);
 
   //! defined in either the prob file or default_pgen.cpp in ../pgen/
   void InitUserMeshData(ParameterInput *pin);
diff --git a/src/mesh/mesh_refinement.cpp b/src/mesh/mesh_refinement.cpp
index 7c402995ee..89aacf9678 100644
--- a/src/mesh/mesh_refinement.cpp
+++ b/src/mesh/mesh_refinement.cpp
@@ -1504,3 +1504,57 @@ void MeshRefinement::SetHydroRefinement(HydroBoundaryQuantity hydro_type) {
   }
   return;
 }
+
+
+// Constructor
+FaceFieldCorrection::FaceFieldCorrection(int ifrom, int ito, int iface, int isize,
+      int isrc) : from(ifrom), to(ito), face(iface), size(isize), src(isrc) {
+  if (size == 0)
+    buf = nullptr;
+  else
+    buf = new Real[size];
+#ifdef MPI_PARALLEL
+  req = MPI_REQUEST_NULL;
+#endif
+}
+
+
+// Copy constructor
+FaceFieldCorrection::FaceFieldCorrection(const FaceFieldCorrection& c) {
+  from = c.from, to = c.to, face = c.face, size = c.size, src = c.src;
+  if (size == 0)
+    buf = nullptr;
+  else
+    buf = new Real[size];
+  for (int i = 0; i < size; ++i)
+    buf[i] = c.buf[i];
+#ifdef MPI_PARALLEL
+  req = c.req;
+#endif
+}
+
+
+// Move constructor
+FaceFieldCorrection::FaceFieldCorrection(FaceFieldCorrection&& c) {
+  from = c.from, to = c.to, face = c.face, size = c.size, src = c.src;
+  buf = c.buf;
+  c.buf = nullptr;
+#ifdef MPI_PARALLEL
+  req = c.req;
+#endif
+}
+
+
+// Copy assignment operator
+FaceFieldCorrection& FaceFieldCorrection::operator=(const FaceFieldCorrection &c) {
+  if (this != &c) {
+    from = c.from, to = c.to, face = c.face, size = c.size, src = c.src;
+    for (int i = 0; i < size; ++i)
+      buf[i] = c.buf[i];
+  }
+#ifdef MPI_PARALLEL
+  req = c.req;
+#endif
+  return *this;
+}
+
diff --git a/src/mesh/mesh_refinement.hpp b/src/mesh/mesh_refinement.hpp
index 208b41340e..feee602e9b 100644
--- a/src/mesh/mesh_refinement.hpp
+++ b/src/mesh/mesh_refinement.hpp
@@ -107,6 +107,27 @@ class MeshRefinement {
   // tuples of references to AMR-enrolled arrays (quantity, coarse_quantity)
   std::vector<std::tuple<AthenaArray<Real> *, AthenaArray<Real> *>> pvars_cc_;
   std::vector<std::tuple<FaceField *, FaceField *>> pvars_fc_;
+
+  bool flag_ffc_recv_[6];
+};
+
+
+//----------------------------------------------------------------------------------------
+//! \struct FaceFieldCorrection
+//! \brief
+
+struct FaceFieldCorrection {
+  int from, to, face, size, src;
+  Real *buf;
+#ifdef MPI_PARALLEL
+  MPI_Request req;
+#endif
+
+  FaceFieldCorrection(int ifrom, int ito, int iface, int isize, int isrc);
+  ~FaceFieldCorrection() { delete [] buf; }
+  FaceFieldCorrection(const FaceFieldCorrection& c);
+  FaceFieldCorrection(FaceFieldCorrection&& c);
+  FaceFieldCorrection& operator=(const FaceFieldCorrection &c);
 };
 
 #endif // MESH_MESH_REFINEMENT_HPP_
diff --git a/src/multigrid/multigrid.hpp b/src/multigrid/multigrid.hpp
index 8274f8797a..401beacf58 100644
--- a/src/multigrid/multigrid.hpp
+++ b/src/multigrid/multigrid.hpp
@@ -40,24 +40,6 @@ enum class MGNormType {max, l1, l2};
 
 constexpr int minth_ = 8;
 
-//! \fn inline std::int64_t rotl(std::int64_t i, int s)
-//  \brief left bit rotation function for 64bit integers (unsafe if s > 64)
-
-inline std::int64_t rotl(std::int64_t i, int s) {
-  return (i << s) | (i >> (64 - s));
-}
-
-
-//! \struct LogicalLocationHash
-//  \brief Hash function object for LogicalLocation
-
-struct LogicalLocationHash {
- public:
-  std::size_t operator()(const LogicalLocation &l) const {
-    return static_cast<std::size_t>(l.lx1^rotl(l.lx2,21)^rotl(l.lx3,42));
-  }
-};
-
 
 //! \class MGCoordinates
 //  \brief Minimum set of coordinate arrays for Multigrid

From 339840b9ef33d8e4b92e1ef09bea6641fb383fe9 Mon Sep 17 00:00:00 2001
From: Kengo Tomida <tomida@astr.tohoku.ac.jp>
Date: Sun, 1 Sep 2024 19:17:05 +0900
Subject: [PATCH 2/9] Use unordered_map, support periodic BC

---
 src/mesh/amr_loadbalance.cpp | 134 +++++++++++++++++++++++++----------
 src/mesh/mesh.cpp            |   7 ++
 2 files changed, 105 insertions(+), 36 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 5f3440b782..6de13550bb 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -652,9 +652,14 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
   delete [] costlist;
   delete [] newtoold;
   delete [] oldtonew;
-  refined_.clear();
-  ffc_send_.clear();
-  ffc_recv_.clear();
+  if (my_blocks(0)->pmr->pvars_fc_.size() > 0) {
+    refined_.clear();
+    ffc_send_.clear();
+    ffc_recv_.clear();
+    for (int l = 0; l < max_level - root_level - 1; ++l)
+      locmap_[l].clear();
+  }
+
 #ifdef MPI_PARALLEL
   if (nsend != 0) {
     MPI_Waitall(nsend, req_send, MPI_STATUSES_IGNORE);
@@ -1257,6 +1262,16 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
   const int bnx3 = my_blocks(0)->block_size.nx3;
   const int nf = my_blocks(0)->pmr->pvars_fc_.size();
   const int sffc1 = nf*bnx2*bnx3, sffc2 = nf*bnx1*bnx3, sffc3 = nf*bnx2*bnx3;
+
+  // Step FFC1. Construct a hash map of MeshBlocks before refinement
+  for (int i = 0; i < nbtold; ++i) {
+    LogicalLocation const &loc = loclist[i];
+    if (loc.level > root_level) {
+      int lev = loc.level - root_level - 1;
+      locmap_[lev][loc] = i;
+    }
+  }
+
   // Step FFC1. Find pairs that need face field correction
   int s = 0;
   for (int n : refined_) { // loop over refined MeshBlocks
@@ -1268,45 +1283,92 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
     o1 = (xf1 << 1) - 1;
     o2 = (xf2 << 1) - 1;
     o3 = (xf3 << 1) - 1;
-    // *** KT: This loop over all the current MeshBlcks is a naive implementation.
-    // The MeshBlockTree is already updated at this point, so we have to search loclist.
-    // Perhaps we should use std::unordered_map if this part costs too much.
-    // We already have such an example in MultigridDriver (octet map).
-    for (int i = 0; i < nbtold; ++i) {
-      LogicalLocation const &floc = loclist[i];
-      if (nloc.level == floc.level) {
-        if (nloc.lx1 + o1 == floc.lx1 && nloc.lx2 == floc.lx2 && nloc.lx3 == floc.lx3) {
+    LogicalLocation floc = nloc;
+    int lev = floc.level - root_level - 1;
+    floc.lx1 += o1;
+    bool fskip = false;
+    if (floc.lx1 < 0) {
+      if (mesh_bcs[BoundaryFace::inner_x1] == BoundaryFlag::periodic)
+        floc.lx1 = (nrbx1 << (lev+1)) - 1;
+      else
+        fskip = true;
+    } else if (floc.lx1 >= (nrbx1 << (lev+1))) {
+      if (mesh_bcs[BoundaryFace::outer_x1] == BoundaryFlag::periodic)
+        floc.lx1 = 0;
+      else
+        fskip = true;
+    }
+    if (!fskip) {
+      auto itr = locmap_[lev].find(floc);
+      if (itr != locmap_[lev].end()) {
+        const auto& it = *itr;
+        int i = it.second;
+        if (ranklist[i] == Globals::my_rank) {
+          ffc_send_.emplace_back(i, n, xf1^1, sffc1, -1);
+          if (newrank[n] == Globals::my_rank)
+            ffc_recv_.emplace_back(i, n, xf1, 0, s);
+          s++;
+        } else if (newrank[n] == Globals::my_rank) {
+          ffc_recv_.emplace_back(i, n, xf1, sffc1, -1);
+        }
+      }
+    }
+    if (f2) {
+      floc.lx1 = nloc.lx1, floc.lx2 += o2;
+      fskip = false;
+      if (floc.lx2 < 0) {
+        if (mesh_bcs[BoundaryFace::inner_x2] == BoundaryFlag::periodic)
+          floc.lx2 = (nrbx2 << (lev+1)) - 1;
+        else
+          fskip = true;
+      } else if (floc.lx2 >= (nrbx2 << (lev+1))) {
+        if (mesh_bcs[BoundaryFace::outer_x2] == BoundaryFlag::periodic)
+          floc.lx2 = 0;
+        else
+          fskip = true;
+      }
+      if (!fskip) {
+        auto itr = locmap_[lev].find(floc);
+        if (itr != locmap_[lev].end()) {
+          const auto& it = *itr;
+          int i = it.second;
           if (ranklist[i] == Globals::my_rank) {
-            ffc_send_.emplace_back(i, n, xf1^1, sffc1, -1);
+            ffc_send_.emplace_back(i, n, xf2^1 + 2, sffc2, -1);
             if (newrank[n] == Globals::my_rank)
-              ffc_recv_.emplace_back(i, n, xf1, 0, s);
+              ffc_recv_.emplace_back(i, n, xf2 + 2, 0, s);
             s++;
           } else if (newrank[n] == Globals::my_rank) {
-            ffc_recv_.emplace_back(i, n, xf1, sffc1, -1);
+            ffc_recv_.emplace_back(i, n, xf2 + 2, sffc2, -1);
           }
         }
-        if (f2) {
-          if (nloc.lx1 == floc.lx1 && nloc.lx2 + o2 == floc.lx2 && nloc.lx3 == floc.lx3) {
-            if (ranklist[i] == Globals::my_rank) {
-              ffc_send_.emplace_back(i, n, xf2^1 + 2, sffc2, -1);
-              if (newrank[n] == Globals::my_rank)
-                ffc_recv_.emplace_back(i, n, xf2 + 2, 0, s);
-              s++;
-            } else if (newrank[n] == Globals::my_rank) {
-              ffc_recv_.emplace_back(i, n, xf2 + 2, sffc2, -1);
-            }
-          }
-        }
-        if (f3) {
-          if (nloc.lx1 == floc.lx1 && nloc.lx2 == floc.lx2 && nloc.lx3 + o3 == floc.lx3) {
-            if (ranklist[i] == Globals::my_rank) {
-              ffc_send_.emplace_back(i, n, xf3^1 + 4, sffc3, -1);
-              if (newrank[n] == Globals::my_rank)
-                ffc_recv_.emplace_back(i, n, xf3 + 4, 0, s);
-              s++;
-            } else if (newrank[n] == Globals::my_rank) {
-              ffc_recv_.emplace_back(i, n, xf3 + 4, sffc3, -1);
-            }
+      }
+    }
+    if (f3) {
+      floc.lx2 = nloc.lx2, floc.lx3 += o3;
+      fskip = false;
+      if (floc.lx3 < 0) {
+        if (mesh_bcs[BoundaryFace::inner_x3] == BoundaryFlag::periodic)
+          floc.lx3 = (nrbx3 << (lev+1)) - 1;
+        else
+          fskip = true;
+      } else if (floc.lx3 >= (nrbx3 << (lev+1))) {
+        if (mesh_bcs[BoundaryFace::outer_x3] == BoundaryFlag::periodic)
+          floc.lx3 = 0;
+        else
+          fskip = true;
+      }
+      if (!fskip) {
+        auto itr = locmap_[lev].find(floc);
+        if (itr != locmap_[lev].end()) {
+          const auto& it = *itr;
+          int i = it.second;
+          if (ranklist[i] == Globals::my_rank) {
+            ffc_send_.emplace_back(i, n, xf3^1 + 4, sffc3, -1);
+            if (newrank[n] == Globals::my_rank)
+              ffc_recv_.emplace_back(i, n, xf3 + 4, 0, s);
+            s++;
+          } else if (newrank[n] == Globals::my_rank) {
+            ffc_recv_.emplace_back(i, n, xf3 + 4, sffc3, -1);
           }
         }
       }
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 435d85f32a..4d7b09d006 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -518,6 +518,9 @@ Mesh::Mesh(ParameterInput *pin, int mesh_test) :
     bnderef = new int[Globals::nranks];
     brdisp = new int[Globals::nranks];
     bddisp = new int[Globals::nranks];
+    int nlevel = pin->GetOrAddInteger("mesh", "numlevel", 1);
+    locmap_ = new std::unordered_map<LogicalLocation, int,
+                                     LogicalLocationHash>[nlevel - 1];
   }
 
   // initialize cost array with the simplest estimate; all the blocks are equal
@@ -857,6 +860,9 @@ Mesh::Mesh(ParameterInput *pin, IOWrapper& resfile, int mesh_test) :
     bnderef = new int[Globals::nranks];
     brdisp = new int[Globals::nranks];
     bddisp = new int[Globals::nranks];
+    int nlevel = pin->GetOrAddInteger("mesh", "numlevel", 1);
+    locmap_ = new std::unordered_map<LogicalLocation, int,
+                                     LogicalLocationHash>[nlevel - 1];
   }
 
   CalculateLoadBalance(costlist, ranklist, nslist, nblist, nbtotal);
@@ -971,6 +977,7 @@ Mesh::~Mesh() {
     delete [] bnderef;
     delete [] brdisp;
     delete [] bddisp;
+    delete [] locmap_;
   }
   // delete user Mesh data
   if (nreal_user_mesh_data_>0) delete [] ruser_mesh_data;

From ab7d5fede655bf23e2717ce47a7ef20a05cacea5 Mon Sep 17 00:00:00 2001
From: Kengo Tomida <tomida@astr.tohoku.ac.jp>
Date: Mon, 2 Sep 2024 00:42:51 +0900
Subject: [PATCH 3/9] Style and comment fix

---
 src/mesh/amr_loadbalance.cpp | 14 +++++++-------
 src/mesh/mesh.hpp            |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 6de13550bb..8864da054e 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -1225,7 +1225,7 @@ void Mesh::ProlongateMeshBlock(MeshBlock *pb) {
 
   int il = pb->cis, iu = pb->cie+1, jl = pb->cjs, ju = pb->cje + f2,
       kl = pb->cks, ku = pb->cke + f3;
-  // Step FFC6. skip the surface fields contacting previously refined cells
+  // Step FFC8. skip the surface fields contacting previously refined MeshBlocks
   if (pmr->flag_ffc_recv_[BoundaryFace::inner_x1]) il++;
   if (pmr->flag_ffc_recv_[BoundaryFace::outer_x1]) iu--;
   if (pmr->flag_ffc_recv_[BoundaryFace::inner_x2]) jl++;
@@ -1272,7 +1272,7 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
     }
   }
 
-  // Step FFC1. Find pairs that need face field correction
+  // Step FFC2. Find pairs that need face field correction
   int s = 0;
   for (int n : refined_) { // loop over refined MeshBlocks
     LogicalLocation const &nloc = newloc[n];
@@ -1375,7 +1375,7 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
     }
   }
 
-  // Step FFC2. Initiate MPI receive
+  // Step FFC3. Initiate MPI receive
 #ifdef MPI_PARALLEL
   for (FaceFieldCorrection &t : ffc_recv_) {
     if (ranklist[t.from] != Globals::my_rank) {
@@ -1386,7 +1386,7 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
   }
 #endif
 
-  // Step FFC3. Pack and send shared face fields
+  // Step FFC4. Pack and send shared face fields
   for (FaceFieldCorrection &f : ffc_send_) {
     MeshBlock *pmb = FindMeshBlock(f.from);
     int p = 0;
@@ -1437,14 +1437,14 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
 //! \brief the second part of the face field correction
 
 void Mesh::ReceiveAndSetFaceFieldCorrection(int *newrank) {
-  // Step FFC4. clear face field correction flags
+  // Step FFC5. clear face field correction flags
   for (int n = 0; n < nblocal; ++n) {
     MeshRefinement *pmr = my_blocks(n)->pmr;
     for (int i = 0; i < 6; ++i)
       pmr->flag_ffc_recv_[i] = false;
   }
 
-  // Step FFC5. wait and unpack/set
+  // Step FFC6. wait and unpack/set
   for (FaceFieldCorrection &t : ffc_recv_) {
     MeshBlock *pmb = FindMeshBlock(t.to);
     pmb->pmr->flag_ffc_recv_[t.face] = true;
@@ -1488,7 +1488,7 @@ void Mesh::ReceiveAndSetFaceFieldCorrection(int *newrank) {
     }
   }
 
-  // Step FFC6. Finalize MPI send
+  // Step FFC7. Finalize MPI send
 #ifdef MPI_PARALLEL
   for (FaceFieldCorrection &f : ffc_send_) {
     if (newrank[f.to] != Globals::my_rank) {
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index 427182057b..c6f1349826 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -16,8 +16,8 @@
 // C++ headers
 #include <cstdint>     // int64_t
 #include <functional>  // reference_wrapper
-#include <unordered_map>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 // Athena++ headers

From 578e0d0ae3fbcf116e8ad978b4e2b3799291a445 Mon Sep 17 00:00:00 2001
From: Kengo Tomida <tomida@astr.tohoku.ac.jp>
Date: Tue, 10 Sep 2024 15:16:54 +0900
Subject: [PATCH 4/9] locmap was not cleared correctly

---
 src/mesh/amr_loadbalance.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 8864da054e..0219f9418d 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -656,7 +656,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
     refined_.clear();
     ffc_send_.clear();
     ffc_recv_.clear();
-    for (int l = 0; l < max_level - root_level - 1; ++l)
+    for (int l = 0; l < max_level - root_level; ++l)
       locmap_[l].clear();
   }
 

From 5413eecf26a4bb3e979b4154c6fe2af099e47f21 Mon Sep 17 00:00:00 2001
From: Kengo Tomida <tomida@astr.tohoku.ac.jp>
Date: Thu, 19 Sep 2024 01:44:02 +0900
Subject: [PATCH 5/9] Minor update for performance

Strore the AMR buffer sizes instead of calculating it at every step.
---
 src/mesh/amr_loadbalance.cpp | 50 +++------------------
 src/mesh/mesh.cpp            | 86 ++++++++++++++++++++++++++++++++++++
 src/mesh/mesh.hpp            |  1 +
 3 files changed, 92 insertions(+), 45 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 0219f9418d..7a3df121f1 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -396,10 +396,6 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
   int nbs = nslist[Globals::my_rank];
   int nbe = nbs + nblist[Globals::my_rank] - 1;
 
-  const int bnx1 = my_blocks(0)->block_size.nx1;
-  const int bnx2 = my_blocks(0)->block_size.nx2;
-  const int bnx3 = my_blocks(0)->block_size.nx3;
-
 #ifdef MPI_PARALLEL
   // Step 3. count the number of the blocks to be sent / received
   int nsend = 0, nrecv = 0;
@@ -428,45 +424,9 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
     }
   }
 
-  // Step 4. calculate buffer sizes
+  // Step 4. allocate and start receiving buffers
   Real **sendbuf, **recvbuf;
-  // use the first MeshBlock in the linked list of blocks belonging to this MPI rank as a
-  // representative of all MeshBlocks for counting the "load-balancing registered" and
-  // "SMR/AMR-enrolled" quantities (loop over MeshBlock::vars_cc_, not MeshRefinement)
-
-  //! \todo (felker):
-  //! * add explicit check to ensure that elements of pb->vars_cc/fc_ and
-  //!   pb->pmr->pvars_cc/fc_ v point to the same objects, if adaptive
-
-  // int num_cc = my_blocks(0)->pmr->pvars_cc_.size();
-  int num_fc = my_blocks(0)->vars_fc_.size();
-  int nx4_tot = 0;
-  for (AthenaArray<Real> &var_cc : my_blocks(0)->vars_cc_) {
-    nx4_tot += var_cc.GetDim4();
-  }
-  // radiation variables are not included in vars_cc as they need different order
-  if ((NR_RADIATION_ENABLED|| IM_RADIATION_ENABLED)) {
-    nx4_tot += my_blocks(0)->pnrrad->ir.GetDim1();
-  }
-
-  // cell-centered quantities enrolled in SMR/AMR
-  int bssame = bnx1*bnx2*bnx3*nx4_tot;
-  int bsf2c = (bnx1/2)*((bnx2 + 1)/2)*((bnx3 + 1)/2)*nx4_tot;
-  int bsc2f = (bnx1/2 + 2)*((bnx2 + 1)/2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)*nx4_tot;
-  // face-centered quantities enrolled in SMR/AMR
-  bssame += num_fc*((bnx1 + 1)*bnx2*bnx3 + bnx1*(bnx2 + f2)*bnx3
-                    + bnx1*bnx2*(bnx3 + f3));
-  bsf2c += num_fc*(((bnx1/2) + 1)*((bnx2 + 1)/2)*((bnx3 + 1)/2)
-                   + (bnx1/2)*(((bnx2 + 1)/2) + f2)*((bnx3 + 1)/2)
-                   + (bnx1/2)*((bnx2 + 1)/2)*(((bnx3 + 1)/2) + f3));
-  bsc2f += num_fc*(((bnx1/2) + 1 + 2)*((bnx2 + 1)/2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)
-                   + (bnx1/2 + 2)*(((bnx2 + 1)/2) + f2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)
-                   + (bnx1/2 + 2)*((bnx2 + 1)/2 + 2*f2)*(((bnx3 + 1)/2) + f3 + 2*f3));
-  // add one more element to buffer size for storing the derefinement counter
-  bssame++;
-
   MPI_Request *req_send, *req_recv;
-  // Step 5. allocate and start receiving buffers
   if (nrecv != 0) {
     recvbuf = new Real*[nrecv];
     req_recv = new MPI_Request[nrecv];
@@ -503,7 +463,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
       }
     }
   }
-  // Step 6. allocate, pack and start sending buffers
+  // Step 5. allocate, pack and start sending buffers
   if (nsend != 0) {
     sendbuf = new Real*[nsend];
     req_send = new MPI_Request[nsend];
@@ -550,7 +510,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
   if (my_blocks(0)->pmr->pvars_fc_.size() > 0)
     PrepareAndSendFaceFieldCorrection(newloc, ranklist, newrank, nslist, nbtold);
 
-  // Step 7. construct a new MeshBlock list (moving the data within the MPI rank)
+  // Step 6. construct a new MeshBlock list (moving the data within the MPI rank)
   AthenaArray<MeshBlock*> newlist;
   newlist.NewAthenaArray(nblist[Globals::my_rank]);
   RegionSize block_size = my_blocks(0)->block_size;
@@ -600,7 +560,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
   gids_ = nbs;
   gide_ = nbe;
 
-  // Step 8. Receive the data and load into MeshBlocks
+  // Step 7. Receive the data and load into MeshBlocks
   // This is a test: try MPI_Waitall later.
 #ifdef MPI_PARALLEL
   if (nrecv != 0) {
@@ -635,7 +595,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, int ntot) {
   if (my_blocks(0)->pmr->pvars_fc_.size() > 0)
     ReceiveAndSetFaceFieldCorrection(newrank);
 
-  // Step 8b. Prolongate MeshBlocks
+  // Step 7b. Prolongate MeshBlocks
   for (int n=nbs; n<=nbe; n++) {
     int on = newtoold[n];
     LogicalLocation const &oloc = loclist[on];
diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 4d7b09d006..9efe9570dc 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -568,6 +568,49 @@ Mesh::Mesh(ParameterInput *pin, int mesh_test) :
 
   if (turb_flag > 0) // TurbulenceDriver depends on the MeshBlock ctor
     ptrbd = new TurbulenceDriver(this, pin);
+
+#ifdef MPI_PARALLEL
+  if (adaptive) {
+    const int bnx1 = block_size.nx1;
+    const int bnx2 = block_size.nx2;
+    const int bnx3 = block_size.nx3;
+
+    // use the first MeshBlock in the linked list of blocks belonging to this MPI rank as a
+    // representative of all MeshBlocks for counting the "load-balancing registered" and
+    // "SMR/AMR-enrolled" quantities (loop over MeshBlock::vars_cc_, not MeshRefinement)
+
+    //! \todo (felker):
+    //! * add explicit check to ensure that elements of pb->vars_cc/fc_ and
+    //!   pb->pmr->pvars_cc/fc_ v point to the same objects, if adaptive
+
+    // int num_cc = my_blocks(0)->pmr->pvars_cc_.size();
+    int num_fc = my_blocks(0)->vars_fc_.size();
+    int nx4_tot = 0;
+    for (AthenaArray<Real> &var_cc : my_blocks(0)->vars_cc_) {
+      nx4_tot += var_cc.GetDim4();
+    }
+    // radiation variables are not included in vars_cc as they need different order
+    if ((NR_RADIATION_ENABLED|| IM_RADIATION_ENABLED)) {
+      nx4_tot += my_blocks(0)->pnrrad->ir.GetDim1();
+    }
+
+    // cell-centered quantities enrolled in SMR/AMR
+    bssame = bnx1*bnx2*bnx3*nx4_tot;
+    bsf2c = (bnx1/2)*((bnx2 + 1)/2)*((bnx3 + 1)/2)*nx4_tot;
+    bsc2f = (bnx1/2 + 2)*((bnx2 + 1)/2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)*nx4_tot;
+    // face-centered quantities enrolled in SMR/AMR
+    bssame += num_fc*((bnx1 + 1)*bnx2*bnx3 + bnx1*(bnx2 + f2)*bnx3
+                      + bnx1*bnx2*(bnx3 + f3));
+    bsf2c += num_fc*(((bnx1/2) + 1)*((bnx2 + 1)/2)*((bnx3 + 1)/2)
+                     + (bnx1/2)*(((bnx2 + 1)/2) + f2)*((bnx3 + 1)/2)
+                     + (bnx1/2)*((bnx2 + 1)/2)*(((bnx3 + 1)/2) + f3));
+    bsc2f += num_fc*(((bnx1/2) + 1 + 2)*((bnx2 + 1)/2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)
+                     + (bnx1/2 + 2)*(((bnx2 + 1)/2) + f2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)
+                     + (bnx1/2 + 2)*((bnx2 + 1)/2 + 2*f2)*(((bnx3 + 1)/2) + f3 + 2*f3));
+    // add one more element to buffer size for storing the derefinement counter
+    bssame++;
+  }
+#endif
 }
 
 //----------------------------------------------------------------------------------------
@@ -950,6 +993,49 @@ Mesh::Mesh(ParameterInput *pin, IOWrapper& resfile, int mesh_test) :
 
   if (turb_flag > 0) // TurbulenceDriver depends on the MeshBlock ctor
     ptrbd = new TurbulenceDriver(this, pin);
+
+#ifdef MPI_PARALLEL
+  if (adaptive) {
+    const int bnx1 = block_size.nx1;
+    const int bnx2 = block_size.nx2;
+    const int bnx3 = block_size.nx3;
+
+    // use the first MeshBlock in the linked list of blocks belonging to this MPI rank as a
+    // representative of all MeshBlocks for counting the "load-balancing registered" and
+    // "SMR/AMR-enrolled" quantities (loop over MeshBlock::vars_cc_, not MeshRefinement)
+
+    //! \todo (felker):
+    //! * add explicit check to ensure that elements of pb->vars_cc/fc_ and
+    //!   pb->pmr->pvars_cc/fc_ v point to the same objects, if adaptive
+
+    // int num_cc = my_blocks(0)->pmr->pvars_cc_.size();
+    int num_fc = my_blocks(0)->vars_fc_.size();
+    int nx4_tot = 0;
+    for (AthenaArray<Real> &var_cc : my_blocks(0)->vars_cc_) {
+      nx4_tot += var_cc.GetDim4();
+    }
+    // radiation variables are not included in vars_cc as they need different order
+    if ((NR_RADIATION_ENABLED|| IM_RADIATION_ENABLED)) {
+      nx4_tot += my_blocks(0)->pnrrad->ir.GetDim1();
+    }
+
+    // cell-centered quantities enrolled in SMR/AMR
+    bssame = bnx1*bnx2*bnx3*nx4_tot;
+    bsf2c = (bnx1/2)*((bnx2 + 1)/2)*((bnx3 + 1)/2)*nx4_tot;
+    bsc2f = (bnx1/2 + 2)*((bnx2 + 1)/2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)*nx4_tot;
+    // face-centered quantities enrolled in SMR/AMR
+    bssame += num_fc*((bnx1 + 1)*bnx2*bnx3 + bnx1*(bnx2 + f2)*bnx3
+                      + bnx1*bnx2*(bnx3 + f3));
+    bsf2c += num_fc*(((bnx1/2) + 1)*((bnx2 + 1)/2)*((bnx3 + 1)/2)
+                     + (bnx1/2)*(((bnx2 + 1)/2) + f2)*((bnx3 + 1)/2)
+                     + (bnx1/2)*((bnx2 + 1)/2)*(((bnx3 + 1)/2) + f3));
+    bsc2f += num_fc*(((bnx1/2) + 1 + 2)*((bnx2 + 1)/2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)
+                     + (bnx1/2 + 2)*(((bnx2 + 1)/2) + f2 + 2*f2)*((bnx3 + 1)/2 + 2*f3)
+                     + (bnx1/2 + 2)*((bnx2 + 1)/2 + 2*f2)*(((bnx3 + 1)/2) + f3 + 2*f3));
+    // add one more element to buffer size for storing the derefinement counter
+    bssame++;
+  }
+#endif
 }
 
 //----------------------------------------------------------------------------------------
diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp
index c6f1349826..24fb49b34a 100644
--- a/src/mesh/mesh.hpp
+++ b/src/mesh/mesh.hpp
@@ -341,6 +341,7 @@ class Mesh {
   bool lb_flag_, lb_automatic_, lb_manual_;
   double lb_tolerance_;
   int lb_interval_;
+  int bssame, bsf2c, bsc2f;
 
   // for AMR face field correction
   std::unordered_map<LogicalLocation, int, LogicalLocationHash> *locmap_;

From 19c0c29aef64cae5c15ec031da67dbe1c547143e Mon Sep 17 00:00:00 2001
From: Kengo Tomida <tomida@astr.tohoku.ac.jp>
Date: Thu, 19 Sep 2024 21:40:03 +0900
Subject: [PATCH 6/9] style fix

---
 src/mesh/mesh.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp
index 9efe9570dc..eef2015ff6 100644
--- a/src/mesh/mesh.cpp
+++ b/src/mesh/mesh.cpp
@@ -575,8 +575,8 @@ Mesh::Mesh(ParameterInput *pin, int mesh_test) :
     const int bnx2 = block_size.nx2;
     const int bnx3 = block_size.nx3;
 
-    // use the first MeshBlock in the linked list of blocks belonging to this MPI rank as a
-    // representative of all MeshBlocks for counting the "load-balancing registered" and
+    // use the first MeshBlock in the linked list of blocks belonging to this MPI rank as
+    // a representative of all MeshBlocks for counting the "load-balancing registered" and
     // "SMR/AMR-enrolled" quantities (loop over MeshBlock::vars_cc_, not MeshRefinement)
 
     //! \todo (felker):
@@ -1000,8 +1000,8 @@ Mesh::Mesh(ParameterInput *pin, IOWrapper& resfile, int mesh_test) :
     const int bnx2 = block_size.nx2;
     const int bnx3 = block_size.nx3;
 
-    // use the first MeshBlock in the linked list of blocks belonging to this MPI rank as a
-    // representative of all MeshBlocks for counting the "load-balancing registered" and
+    // use the first MeshBlock in the linked list of blocks belonging to this MPI rank as
+    // a representative of all MeshBlocks for counting the "load-balancing registered" and
     // "SMR/AMR-enrolled" quantities (loop over MeshBlock::vars_cc_, not MeshRefinement)
 
     //! \todo (felker):

From 4a27016baad4680efcf8fd6825be22d35f02c284 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 14 Oct 2024 14:37:05 -0500
Subject: [PATCH 7/9] Update src/mesh/amr_loadbalance.cpp

---
 src/mesh/amr_loadbalance.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 7a3df121f1..31a18afec7 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -1220,7 +1220,7 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
   const int bnx1 = my_blocks(0)->block_size.nx1;
   const int bnx2 = my_blocks(0)->block_size.nx2;
   const int bnx3 = my_blocks(0)->block_size.nx3;
-  const int nf = my_blocks(0)->pmr->pvars_fc_.size();
+  const int nf = static_cast<int>(my_blocks(0)->pmr->pvars_fc_.size());
   const int sffc1 = nf*bnx2*bnx3, sffc2 = nf*bnx1*bnx3, sffc3 = nf*bnx2*bnx3;
 
   // Step FFC1. Construct a hash map of MeshBlocks before refinement

From b7e8f9959220362d38ed89991fce9e4e1480f892 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker <felker@anl.gov>
Date: Mon, 14 Oct 2024 21:05:15 -0500
Subject: [PATCH 8/9] Apply suggestions from code review

---
 src/mesh/amr_loadbalance.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 31a18afec7..352f59518b 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -1293,7 +1293,7 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
           const auto& it = *itr;
           int i = it.second;
           if (ranklist[i] == Globals::my_rank) {
-            ffc_send_.emplace_back(i, n, xf2^1 + 2, sffc2, -1);
+            ffc_send_.emplace_back(i, n, xf2^(1 + 2), sffc2, -1);
             if (newrank[n] == Globals::my_rank)
               ffc_recv_.emplace_back(i, n, xf2 + 2, 0, s);
             s++;
@@ -1323,7 +1323,7 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
           const auto& it = *itr;
           int i = it.second;
           if (ranklist[i] == Globals::my_rank) {
-            ffc_send_.emplace_back(i, n, xf3^1 + 4, sffc3, -1);
+            ffc_send_.emplace_back(i, n, xf3^(1 + 4), sffc3, -1);
             if (newrank[n] == Globals::my_rank)
               ffc_recv_.emplace_back(i, n, xf3 + 4, 0, s);
             s++;

From fed4b64c9856f1daeb4c78c4c18bae38a484434c Mon Sep 17 00:00:00 2001
From: Kengo TOMIDA <tomida@astr.tohoku.ac.jp>
Date: Tue, 15 Oct 2024 12:20:04 +0900
Subject: [PATCH 9/9] Update amr_loadbalance.cpp

It was my mistake and I have to thank the compiler.
---
 src/mesh/amr_loadbalance.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp
index 352f59518b..7a140bed6f 100644
--- a/src/mesh/amr_loadbalance.cpp
+++ b/src/mesh/amr_loadbalance.cpp
@@ -1293,7 +1293,7 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
           const auto& it = *itr;
           int i = it.second;
           if (ranklist[i] == Globals::my_rank) {
-            ffc_send_.emplace_back(i, n, xf2^(1 + 2), sffc2, -1);
+            ffc_send_.emplace_back(i, n, (xf2^1) + 2, sffc2, -1);
             if (newrank[n] == Globals::my_rank)
               ffc_recv_.emplace_back(i, n, xf2 + 2, 0, s);
             s++;
@@ -1323,7 +1323,7 @@ void Mesh::PrepareAndSendFaceFieldCorrection(LogicalLocation *newloc,
           const auto& it = *itr;
           int i = it.second;
           if (ranklist[i] == Globals::my_rank) {
-            ffc_send_.emplace_back(i, n, xf3^(1 + 4), sffc3, -1);
+            ffc_send_.emplace_back(i, n, (xf3^1) + 4, sffc3, -1);
             if (newrank[n] == Globals::my_rank)
               ffc_recv_.emplace_back(i, n, xf3 + 4, 0, s);
             s++;