diff --git a/src/codegen/code_gen_cuda.cc b/src/codegen/code_gen_cuda.cc
index 8eaf5df15..c6f717f13 100644
--- a/src/codegen/code_gen_cuda.cc
+++ b/src/codegen/code_gen_cuda.cc
@@ -206,13 +206,20 @@ bool CodeGenCUDA::canRunInKernel(const Stmt &stmt) {
         // No VarDef here, because memory are more likely to be wasted
         // (allocated in a more conservative way) inside a kernel due to lack of
         // synchronization
-
-        // TODO: If we are going to implement hybrid CPU-GPU parallelization, we
-        // also need to reject any OpenMP scope here
-
         return false;
     }
 
+    for (auto &&_loop : findAllStmt(stmt, "<For>")) {
+        auto &&loop = _loop.as<ForNode>();
+        // If any inner loop is already parallelized, we can't safely extend
+        // the kernel scope; otherwise the barrier at the end of each kernel
+        // launch would be ignored. This branch also rejects OpenMP scopes for
+        // (possibly future) hybrid CPU-GPU parallelization.
+        if (loop->property_->parallel_ != serialScope) {
+            return false;
+        }
+    }
+
     for (auto &&var : allUses(stmt)) {
         auto mtype = buffer(var)->mtype();
         if (mtype != MemType::GPULocal && mtype != MemType::GPUWarp &&