From eee9d31ee2761171a937f49be04708595530efb6 Mon Sep 17 00:00:00 2001
From: Clark Chin <xi2.chen@intel.com>
Date: Wed, 10 Jan 2024 09:53:38 +0800
Subject: [PATCH] add shm and model enabled with CI case

Signed-off-by: Clark Chin <xi2.chen@intel.com>
---
 .github/workflows/scripts/models/run_tp.sh    | 188 ++++++++++++++++++
 neural_speed/application/main_run.cpp         |   9 +
 neural_speed/core/layers/mha_dense.cpp        |  39 +++-
 neural_speed/core/ne_layers.c                 | 143 +++++--------
 neural_speed/core/parallel_context.cpp        |  43 +++-
 neural_speed/core/shared_memory_ccl.hpp       | 136 +++++++++++++
 neural_speed/models/baichuan/baichuan.cpp     |  32 ++-
 neural_speed/models/chatglm/chatglm2.cpp      |  35 +++-
 neural_speed/models/gptj/gptj.cpp             |   7 -
 neural_speed/models/model_utils/model_files.h | 106 ++++++++--
 .../models/model_utils/model_utils.cpp        |  23 ++-
 11 files changed, 618 insertions(+), 143 deletions(-)
 create mode 100644 .github/workflows/scripts/models/run_tp.sh
 create mode 100644 neural_speed/core/shared_memory_ccl.hpp

diff --git a/.github/workflows/scripts/models/run_tp.sh b/.github/workflows/scripts/models/run_tp.sh
new file mode 100644
index 000000000..02ca44949
--- /dev/null
+++ b/.github/workflows/scripts/models/run_tp.sh
@@ -0,0 +1,188 @@
+#!/bin/bash
+set -eo pipefail  # abort on the first failing command, including inside pipelines
+set -x  # trace every command as it is executed
+
+# IMPORTANT! we use half of the cores of one socket to simulate TP functionality
+cores_list=(24)  # iterated as cores_per_instance in main
+# model_list=("llama2-7b" "llama2-13b" "llama2-70b" "gptj-6b" "baichuan-13b")
+model_list=("llama2-7b")  # models to run; main maps each name to its convert/quant/run tools
+input_list=(32 1024 2016)  # iterated as input in main, which picks the prompt or token ids per value
+output=32  # presumably the number of tokens to generate -- TODO confirm where it is consumed
+beam_list=(1)  # presumably beam-search widths -- TODO confirm against usage in main
+precision_list=("q4_j")  # iterated as precision in main
+
+function main {
+    # conda_env="$1"
+    # get the compiler version
+    gcc_version_info=$(gcc --version)
+    compiler_version=$(echo "$gcc_version_info" | grep -oP 'gcc \(.*\) \K[0-9]+\.[0-9]+\.[0-9]+' | head -1)
+
+    working_dir="${WORKING_DIR}"
+    scripts_dir="${working_dir}/neural_speed/convert"
+
+    # init conda
+    # . $(dirname ${CONDA_EXE})/../etc/profile.d/conda.sh
+    # conda activate $conda_env || source activate $conda_env
+    pip install cmake psutil
+    if [[ "${compiler_version}" != "12.3.0" ]]; then
+        conda install --update-deps -c conda-forge gxx==${compiler_version} gcc==${compiler_version} gxx_linux-64==${compiler_version} libstdcxx-ng sysroot_linux-64 -y
+    fi
+
+    # check oneCCL and build
+    ccl_dir=${working_dir}/oneCCL/build/_install
+    if [ ! -d "$ccl_dir" ]; then
+        cd ${working_dir}
+        git clone https://github.com/intel/oneCCL.git
+        cd ${working_dir}/oneCCL
+        git checkout 2021.9
+        sed -i 's/cpu_gpu_dpcpp/./g' cmake/templates/oneCCLConfig.cmake.in
+        mkdir build && cd build
+        cmake ..
+        make -j install
+    fi
+    source ${ccl_dir}/env/setvars.sh
+
+    # compile binary
+    cd ${working_dir}
+    if [ ! -d "$ccl_dir" ]; then
+        mkdir build
+    fi
+    cd build
+    cmake -DNE_TP=ON .. 
+    make -j
+    cd ..
+
+    ## prepare example requirements
+    pip install -r requirements.txt
+
+    # launch benchmark
+    for model in ${model_list[@]}; do
+        if [[ "${model}" == "llama2-7b" ]]; then
+            convert_script="${scripts_dir}/convert_llama.py"
+            quant_script="./build/bin/quant_llama"
+            infer_cmd="./build/bin/run_llama"
+            input_model="/tf_dataset2/models/nlp_toolkit/llama-2-7b/Llama-2-7b-hf"
+        elif [[ "${model}" == "llama2-13b" ]]; then
+            convert_script="${scripts_dir}/convert_llama.py"
+            quant_script="./build/bin/quant_llama"
+            infer_cmd="./build/bin/run_llama"
+            input_model="/tf_dataset2/models/nlp_toolkit/llama-2-13b/Llama-2-13b-hf"
+        elif [[ "${model}" == "llama2-70b" ]]; then
+            convert_script="${scripts_dir}/convert_llama.py"
+            quant_script="./build/bin/quant_llama"
+            infer_cmd="./build/bin/run_llama"
+            input_model="/tf_dataset2/models/nlp_toolkit/llama-2-70b/Llama-2-70b-hf"
+        elif [[ "${model}" == "gptj-6b" ]]; then
+            convert_script="${scripts_dir}/convert_gptj.py"
+            quant_script="./build/bin/quant_gptj"
+            infer_cmd="./build/bin/run_gptj"
+            input_model="/tf_dataset2/models/pytorch/gpt-j-6B"
+        elif [[ "${model}" == "baichuan-13b" ]]; then
+            convert_script="${scripts_dir}/convert_baichuan.py"
+            quant_script="./build/bin/quant_baichuan"
+            infer_cmd="./build/bin/run_baichuan --repeat-penalty 1.1 --keep 0 --batch-size-truncate 512"
+            input_model="/tf_dataset2/models/pytorch/baichuan-13B"
+        fi
+        ## prepare the fp32 bin if it does not exist yet
+        f32_model=${working_dir}/${model}-f32.bin
+        if [ ! -f "$f32_model" ]; then
+          python ${convert_script} --outtype f32 --outfile ${f32_model} ${input_model}
+        fi
+        for cores_per_instance in ${cores_list[@]}; do
+            for input in ${input_list[@]}; do
+                for precision in ${precision_list[@]}; do
+                    # [[ "${input}" == "32" ]] && output=32 ||
+                    if [[ "${input}" == "32" ]]; then
+                        if [[ "${model}" == "baichuan"* ]]; then
+                            ids="195, 13602, 5920, 1346, 1750, 92323, 1772, 29165, 1346, 3110, 5074, 92323, 1642, 16077, 1375,1527, 27044, 72, 2556, 4891, 1375, 1596, 1375, 6679, 1377, 3099, 1662, 1808, 92323, 1377, 1527, 2431, 72, 196" 
+                        else
+                            prompt="Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun."
+                        fi
+                    elif [[ "${input}" == "2016" ]]; then
+                        if [[ "${model}" == "llama"* ]]; then
+
+                            prompt="It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I have not seen any entry using this at its core gameplay. 
Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. 
Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around. If pasta from other team are around, though, they get locked down in their attempt, unable to conquer the plate, until one of them die (think Battlefield's standard 'Conquest' mode). You get points every second for every plate you own. Over time, the concept also evolved to use an Italian bistro as its main scenario. Carlos, Carlos' Bistro's founder and owner Setup No major changes were made from my work setup. I used FDT and Starling creating an Adobe AIR (ActionScript) project, all tools or frameworks I already had some knowledge with. One big change for me was that I livestreamed my work through a twitch.tv account. This was a new thing for me. As recommended by Roushey, I used a program called XSplit and I got to say, it is pretty amazing. It made the livestream pretty effortless and the features are awesome, even for the free version. It was great to have some of my friends watch me, and then interact with them and random people through chat. It was also good knowing that I was also recording a local version of the files, so I could make a timelapse video later. Knowing the video was being recorded also made me a lot more self-conscious about my computer use, as if someone was watching over my shoulder. 
It made me realize that sometimes I spend too much time in seemingly inane tasks (I ended up wasting the longest time just to get some text alignment the way I wanted - it'll probably drive someone crazy if they watch it) and that I do way too many typos where writing code. I pretty much spend half of the time writing a line and the other half fixing the crazy characters in it. My own stream was probably boring to watch since I was coding for the most time. But livestreaming is one of the cool things to do as a spectator too. It was great seeing other people working - I had a few tabs opened on my second monitor all the time. It's actually a bit sad, because if I could, I could have spent the whole weekend just watching other people working! But I had to do my own work, so I'd only do it once in a while, when resting for a bit. Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended up going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but I also went overboard. For example: to know the state of a plate (who owns it, who's conquering it and how much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They think it's some kind of prop, so they never actually read its details. Plus, if you're zoomed out too much, you can't actually read it, so it's hard to know what's going on with the game until you zoom in to the area of a specific plate. One other solution that didn't turn out to be as perfect as I thought was how to indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its color denotes the team owner. But it's something that fits so well into the design that people never realized it, until they were told about it. 
In the end, the idea of going with a full physical metaphor is one that should be done with care. Things that are very important risk becoming background noise, unless the player knows its importance. Originally, I wanted to avoid any kind of heads-up display in my game. In the end, I ended up adding it at the bottom to indicate your credits and bases owned, as well as the hideous out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple HUD from the start, especially one that indicated each team's colors and general state of the game without the need for zooming in and out. Development Development went fast. But not fast enough. Even though I worked around 32+ hours for this Ludum Dare, the biggest problem that I had to face in the end was overscoping. I had too much planned"
+                        elif [[ "${model}" == "baichuan"* ]]; then
+                            ids="195, 2287, 1414, 3904, 92323, 1377, 14448, 72, 2136, 1559, 1906, 2124, 20621, 92334, 3838, 1376, 1352, 1398, 1542, 11198, 92404, 1415, 10960, 92323, 1377, 1415, 1352, 5460, 72, 45986, 1415, 1352, 5460, 4067, 92323, 1619, 1438, 1527, 1375, 80244, 5957, 6548, 1412, 5163, 6830, 1377, 1434, 1559, 1425, 1346, 4177, 35074, 72, 2945, 1414, 1346, 2718, 1406, 92404, 92322, 1795, 1375, 3861, 1670, 72, 1406, 1556, 1596, 1899, 1987, 11550, 92323, 2139, 1370, 1376, 3206, 1352, 13972, 1817, 4105, 2126, 92395, 87412, 2611, 72, 37105, 18493, 1744, 1352, 10083, 1495, 5435, 1661, 1376, 1352, 37530, 15385, 1660, 1406 , 1755, 1375, 4467, 72, 48091, 92323, 1406, 1755, 1452, 4752, 1376, 1817, 3650, 1376, 2521, 1406, 4891, 1375, 2144, 92323, 31063, 26802, 92311, 63, 2982, 1436, 1346, 2718, 1376, 24411, 9 2395, 1621, 1712, 92323, 5384, 19316, 92323, 8737, 2048, 1374, 3970, 92323, 16099, 1513, 1346, 2608, 63, 5180, 3019, 72, 1406, 1495, 11115, 1434, 1406, 2006, 5939, 1780, 10083, 2386, 144 2, 72, 1593, 1352, 2163, 92323, 1352, 4243, 1436, 1346, 10083, 1795, 2124, 19176, 4282, 92404, 1374, 1346, 2521, 1414, 1434, 15562, 1414, 1639, 88730, 72, 1745, 9493, 1899, 3473, 21901, 13065, 52904, 1744, 1750, 92323, 1436, 1352, 1877, 30911, 13624, 66410, 35939, 72, 1956, 19109, 2183, 81273, 1414, 92323, 1374, 1751, 10306, 92323, 1346, 2220, 3937, 1376, 8961, 15562, 1 376, 1346, 8318, 11043, 1346, 7143, 72, 2400, 1414, 1442, 1346, 2521, 74, 1593, 1346, 2521, 92323, 1438, 1859, 1375, 3829, 2982, 1375, 6282, 1452, 17869, 72, 3246, 3829, 6084, 2681, 1817 , 15562, 1414, 11711, 1375, 1425, 1795, 72, 2615, 1438, 2754, 1352, 5322, 1375, 4714, 1847, 1375, 34734, 2982, 92323, 1442, 92404, 92319, 1538, 15562, 18766, 92311, 63, 1442, 92404, 9231 9, 1352, 16893, 1376, 25388, 2413, 92323, 1352, 1365, 1620, 36465, 1508, 11101, 2277, 1375, 14635, 1352, 4752, 1376, 15562, 72, 18282, 1720, 92317, 22979, 1377, 1346, 16870, 2968, 12542,  92323, 1434, 92404, 92319, 1538, 2982, 1434, 
13666, 6609, 1643, 1352, 2126, 2088, 72, 38891, 92323, 1751, 7074, 1381, 4123, 3791, 1777, 29772, 1817, 1375, 3554, 1495, 1538, 1436, 1817, 1406, 4891, 1375, 3554, 92323, 1619, 1436, 1817, 1406, 2182, 1538, 72, 1406, 3753, 92404, 92313, 2030, 1375, 3554, 1452, 2124, 1700, 20361, 2413, 92404, 81273, 1377, 6441, 1421, 2181, 14 42, 15562, 72, 1956, 1414, 1346, 4243, 92323, 1376, 3583, 92323, 1987, 1721, 12511, 1506, 1696, 1755, 1375, 4467, 72, 2145, 52618, 1508, 1352, 22398, 14448, 92323, 1538, 2009, 9128, 1375 , 1656, 2386, 1442, 72, 1406, 92404, 92322, 2596, 1352, 1911, 2705, 6518, 1495, 1899, 1352, 2065, 1376, 19649, 9579, 92323, 19810, 72, 3177, 3404, 92323, 1406, 1527, 1538, 3952, 1780, 80 49, 2774, 1528, 1474, 1714, 9050, 31063, 72, 2116, 1382, 92323, 1528, 1414, 1829, 1346, 2431, 7749, 1377, 1887, 1346, 2310, 1406, 6187, 1538, 1375, 1425, 1449, 14985, 1436, 1352, 2521, 4 752, 92323, 1377, 6891, 7714, 1375, 4714, 10591, 1406, 3710, 1767, 1656, 1628, 72, 4009, 8396, 4752, 1495, 1375, 3554, 2982, 2003, 20027, 7156, 1375, 34734, 1375, 1346, 2763, 2963, 1619,  1755, 1773, 3650, 1376, 1365, 4242, 4705, 1375, 4027, 1771, 1513, 4000, 1679, 72, 1406, 3650, 1376, 1755, 1528, 5690, 1376, 3558, 44103, 16283, 1374, 3970, 6322, 1346, 2161, 50038, 1500 , 1346, 3970, 9383, 1458, 1522, 3443, 1374, 92311, 92338, 92335, 92335, 92336, 92345, 1401, 10351, 62076, 1376, 3583, 92351, 1619, 1406, 8418, 92404, 92313, 2336, 1376, 21650, 1458, 2278 , 92345, 5857, 92351, 34139, 1412, 1434, 72, 58958, 92319, 1738, 1751, 2763, 13556, 92323, 1449, 1609, 4654, 52260, 5939, 5773, 1972, 1812, 1352, 15562, 10083, 72, 2400, 1847, 1375, 1977 , 1442, 1656, 74, 5599, 1438, 1352, 1372, 3139, 92323, 1500, 11336, 1352, 58958, 74, 1481, 4120, 1377, 3895, 4752, 3642, 1375, 1643, 1899, 1751, 26645, 92323, 1642, 19810, 6302, 1643, 13 52, 4752, 1376, 3206, 2982, 1670, 1352, 15562, 1376, 16870, 92314, 72, 1481, 1629, 1406, 3710, 1670, 1442, 1352, 1629, 1442, 41324, 1795, 1442, 1767, 1656, 92323, 1679, 1406, 
6187, 1375,  1596, 1436, 1442, 72, 56573, 1646, 1436, 1751, 21132, 1932, 63, 53865, 1451, 1612, 32546, 1458, 16349, 1696, 4627, 1352, 2124, 7376, 76994, 7255, 82749, 92404, 17865, 17179, 1412, 1751,  1598, 6888, 92351, 4163, 6043, 3020, 1352, 6779, 92323, 1449, 1442, 5305, 1812, 1352, 4752, 1376, 3623, 3702, 9283, 1376, 46981, 16283, 2386, 1377, 4705, 1375, 34734, 3428, 1636, 4676, 1581, 63, 9951, 1974, 72, 1401, 17082, 4752, 2328, 1495, 1434, 1352, 2521, 1767, 1656, 1375, 9543, 1847, 1352, 45634, 2312, 89442, 41739, 3642, 1375, 4482, 92311, 63, 1508, 30120, 1513, 1346, 8156, 12956, 5163, 72, 3177, 1352, 4752, 24976, 1629, 1500, 3129, 1812, 1528, 92345, 1438, 1484, 12396, 1346, 5163, 72, 2136, 1527, 1597, 2106, 15556, 92323, 1436, 1414, 1597, 2124 , 12539, 92404, 72, 2945, 1484, 92311, 92358, 1721, 10831, 1474, 1352, 5163, 92323, 2470, 1436, 1609, 2106, 15556, 72, 4902, 15556, 1559, 43826, 3110, 9283, 1376, 46981, 72, 2136, 1616, 1679, 1508, 2124, 77777, 92404, 1771, 1899, 1346, 12277, 72, 5751, 3127, 1382, 1484, 2928, 1765, 3663, 92399, 1773, 1484, 12143, 92323, 1773, 1484, 16310, 72, 2700, 1527, 32607, 2124, 27 824, 92319, 3505, 1660, 1484, 3949, 1960, 1513, 1597, 19002, 1458, 5035, 2104, 1436, 1346, 2717, 1376, 19002, 92351, 72, 10552, 87921, 92323, 1597, 3127, 1382, 2104, 16283, 2386, 72, 863 5, 39428, 1414, 1375, 12116, 1375, 1721, 29614, 92323, 1374, 3281, 1375, 63701, 1771, 1458, 2146, 17869, 1376, 1352, 2521, 1414, 3623, 1597, 46981, 63701, 1581, 1352, 29614, 1415, 1352, 5163, 92351, 72, 2400, 1636, 1484, 2570, 33603, 92323, 1679, 1887, 2176, 87921, 92323, 1438, 1527, 1908, 3829, 1744, 1597, 46981, 1458, 25815, 51297, 92343, 1500, 11041, 92378, 2284, 319 89, 92351, 72, 4902, 46981, 3956, 92404, 92313, 1795, 1721, 1808, 92404, 92319, 46981, 92323, 1679, 1841, 1636, 3099, 92323, 1636, 6450, 22308, 1474, 2470, 1721, 3428, 1661, 14585, 72, 2 136, 1762, 19002, 1412, 1721, 3127, 1382, 1597, 2106, 46981, 11451, 72, 10552, 1346, 46981, 1414, 1374, 1352, 61854, 
1376, 1346, 15556, 92323, 1442, 9537, 49712, 2524, 1442, 1412, 1714, 2223, 72, 1745, 4860, 2386, 92311, 92336, 92335, 11294, 1412, 1346, 15556, 1375, 1425, 84565, 92399, 3129, 1841, 1629, 46981, 1513, 1352, 2493, 2223, 1484, 2386, 72, 2615, 46981, 1513, 1 721, 2223, 1484, 2386, 92323, 3914, 92323, 1636, 1762, 24462, 2257, 1374, 1609, 5591, 92323, 13208, 1375, 63701, 1352, 15556, 92323, 3428, 1661, 1376, 1771, 7442, 1458, 25815, 17637, 509 3, 92404, 92319, 6048, 2124, 3697, 6575, 92404, 11841, 92351, 72, 2136, 1762, 4507, 1987, 2689, 1412, 1987, 15556, 1438, 2106, 72, 6528, 1750, 92323, 1352, 6779, 1696, 24976, 1375, 2065,  1452, 10858, 1372, 73506, 1449, 1714, 2709, 20530, 72, 29684, 92323, 29684, 92404, 1432, 73506, 92404, 92319, 12360, 1377, 8436, 88243, 3244, 3266, 4798, 1738, 2212, 1513, 1751, 1656, 1 9151, 72, 1406, 2368, 1454, 11631, 1377, 6801, 2735, 6622, 1452, 38654, 75338, 1458, 18392, 21226, 92351, 2506, 92323, 1581, 7077, 1500, 54554, 1406, 3474, 1755, 1773, 5631, 1436, 72, 39 35, 2739, 3033, 1412, 1643, 1495, 1434, 1406, 91646, 1370, 1751, 1656, 1899, 1346, 1806, 6632, 72, 20523, 3677, 72, 1956, 1495, 1346, 1662, 3785, 1412, 1643, 72, 2348, 12535, 1508, 1451,  1612, 32546, 92323, 1406, 2368, 1346, 2395, 3330, 3464, 85479, 1377, 1406, 3113, 1375, 2596, 92323, 1442, 1414, 5773, 6981, 72, 1745, 2212, 1352, 91646, 5773, 82224, 1377, 1352, 4448, 1 484, 14727, 92323, 2128, 1412, 1352, 2929, 4684, 72, 1745, 1495, 2220, 1375, 1527, 1773, 1376, 1751, 4662, 4398, 1643, 92323, 1377, 2171, 16593, 1436, 1771, 1377, 13065, 1808, 1899, 1516 1, 72, 1745, 1495, 1696, 2211, 14102, 1434, 1406, 1495, 1696, 12276, 1346, 2978, 4684, 1376, 1352, 8282, 92323, 1679, 1406, 2006, 1977, 1346, 7177, 1391, 16143, 3832, 3665, 72, 65616, 13 52, 3832, 1495, 2176, 9542, 1696, 2212, 1643, 1346, 2718, 1629, 3797, 63, 41569, 1670, 1751, 6797, 2065, 92323, 1449, 1841, 4829, 1495, 9547, 1744, 1751, 22957, 72, 1745, 2212, 1643, 133 31, 1434, 7170, 1406, 7962, 2825, 2236, 1750, 1374, 21901, 
1374, 3189, 15385, 1458, 92346, 8988, 1635, 64917, 1352, 24331, 1750, 1829, 1375, 1762, 1773, 4493, 42766, 1352, 2088, 1406, 48 91, 92311, 63, 1442, 92404, 1692, 5435, 5882, 4829, 16965, 1841, 1636, 4398, 1442, 92351, 1377, 1434, 1406, 1616, 2088, 2825, 2009, 2638, 3519, 2003, 5025, 5013, 72, 1406, 5773, 2236, 79 62, 4275, 1376, 1352, 1750, 5025, 1346, 3438, 1377, 1352, 1721, 4275, 47299, 1352, 16965, 6673, 1374, 1442, 72, 4009, 2106, 6822, 1495, 5435, 32471, 1375, 4398, 2657, 1406, 1495, 38661, 1412, 1352, 1877, 1750, 72, 2400, 91646, 1364, 1414, 1661, 1376, 1352, 7157, 2948, 1375, 1616, 1449, 1346, 8863, 2662, 2825, 72, 1745, 1495, 2220, 8383, 1721, 1808, 2951, 92311, 63, 1406 , 1755, 1346, 2749, 54544, 7828, 1415, 1751, 2689, 9058, 1581, 1352, 1750, 72, 1745, 92404, 92319, 4313, 1346, 4177, 12553, 92323, 2201, 1841, 1406, 2006, 92323, 1406, 2006, 1527, 6406, 1352, 4654, 7563, 1829, 9547, 1721, 1808, 2951, 73, 2400, 1406, 1755, 1375, 1616, 1751, 2106, 1656, 92323, 1679, 1406, 92404, 92322, 1911, 1616, 1442, 3989, 1374, 1346, 2310, 92323, 1777 , 52493, 1412, 1346, 4177, 72, 7487, 9731, 1406, 4891, 1773, 5384, 92323, 4033, 63, 21064, 92323, 2141, 63, 4551, 6176, 3650, 1376, 2413, 92323, 1406, 8988, 1635, 2582, 1436, 15809, 2508 2, 1458, 37294, 92351, 2120, 72, 1406, 2336, 1442, 4734, 2115, 1972, 92323, 28932, 1352, 18683, 1376, 1352, 2521, 92323, 1619, 1406, 1696, 4105, 1744, 5583, 72, 2331, 3937, 92345, 1375, 1996, 1352, 2688, 1376, 1346, 15556, 1458, 16349, 21600, 1442, 92323, 1642, 92404, 92319, 49712, 2524, 1442, 1377, 1847, 2236, 1750, 1636, 1527, 3353, 2262, 49712, 2524, 1442, 92323, 166 0, 46981, 9842, 1484, 1374, 1352, 41416, 92323, 6503, 1716, 1438, 1527, 1375, 2004, 1474, 1352, 15556, 92404, 92319, 6637, 72, 1481, 4243, 1406, 15255, 1777, 4000, 1773, 10398, 1414, 143 4, 1808, 3025, 2004, 1474, 1352, 6637, 73, 2700, 2336, 1442, 92404, 92319, 1773, 3650, 1376, 5139, 92323, 1679, 1636, 3025, 4313, 2309, 1714, 5430, 72, 13399, 92323, 1841, 1438, 92404, 1 349, 
39739, 1370, 1628, 2825, 2236, 92323, 1438, 1559, 92404, 92313, 4313, 2309, 1442, 92323, 1679, 1442, 92404, 92319, 3013, 1375, 1996, 1817, 92404, 92319, 2582, 1415, 1436, 1352, 2521 , 3428, 1438, 39739, 1374, 1375, 1352, 3143, 1376, 1346, 4073, 15556, 72, 3935, 1721, 6518, 1434, 3753, 92404, 92313, 3007, 1628, 1375, 1425, 1449, 4479, 1449, 1406, 3710, 1495, 1847, 13 75, 17836, 1642, 1346, 15556, 5428, 29667, 1375, 72, 1593, 1352, 2521, 92323, 1434, 92404, 92319, 18711, 1508, 1352, 15556, 92404, 92319, 49131, 92311, 63, 1714, 5713, 5862, 11478, 1352,  2223, 8436, 72, 2400, 1442, 92404, 92319, 2982, 1434, 22949, 1679, 1972, 1812, 1352, 2413, 1434, 1808, 3025, 15255, 1442, 92323, 3428, 1636, 1738, 3755, 1670, 1442, 72, 1593, 1352, 2163 , 92323, 1352, 4752, 1376, 2582, 1436, 1346, 2994, 7019, 53345, 1414, 1661, 1434, 2166, 1425, 3904, 1436, 2370, 72, 14620, 1434, 1484, 2115, 2897, 4807, 8422, 7166, 17029, 92323, 11628, 1352, 5752, 9262, 1714, 9515, 72, 48091, 92323, 1406, 4891, 1375, 6599, 1780, 3650, 1376, 14491, 63, 2070, 6139, 1374, 1751, 2521, 72, 1593, 1352, 2163, 92323, 1406, 8988, 1635, 8715, 14 42, 1474, 1352, 9384, 1375, 17836, 1597, 19002, 1377, 29491, 11192, 92323, 1449, 1972, 1449, 1352, 19871, 1612, 1628, 63, 2568, 63, 6581, 63, 1464, 63, 46351, 63, 3556, 63, 63, 1704, 293 1, 2124, 13817, 24808, 4185, 92404, 10002, 72, 2400, 1374, 1385, 11147, 1529, 92323, 1406, 2166, 1527, 8364, 1436, 1346, 5384, 1445, 17183, 1513, 1352, 2104, 92323, 4633, 1661, 1434, 187 11, 2470, 2223, 92404, 92319, 12654, 1377, 4853, 2688, 1376, 1352, 2521, 2923, 1352, 1859, 1412, 39739, 1364, 1374, 1377, 1628, 72, 6881, 6881, 4105, 5697, 72, 2400, 1538, 5697, 3866, 72 , 7669, 3914, 1406, 4734, 2386, 92311, 92354, 92338, 62, 4213, 1412, 1528, 37050, 1477, 1443, 1588, 92323, 1352, 7074, 4243, 1434, 1406, 1755, 1375, 4467, 1374, 1352, 2163, 1495, 16525, 92324, 30245, 72, 1406, 1755, 2825, 2236, 10882, 72, 196" 
+                        elif [[ "${model}" == "gptj-6b" ]]; then
+                            prompt="It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. 
Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. 
Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around. If pasta from other team are around, though, they get locked down in their attempt, unable to conquer the plate, until one of them die (think Battlefield's standard 'Conquest' mode). You get points every second for every plate you own. Over time, the concept also evolved to use an Italian bistro as its main scenario. Carlos, Carlos' Bistro's founder and owner Setup No major changes were made from my work setup. I used FDT and Starling creating an Adobe AIR (ActionScript) project, all tools or frameworks I already had some knowledge with. One big change for me was that I livestreamed my work through a twitch.tv account. This was a new thing for me. As recommended by Roushey, I used a program called XSplit and I got to say, it is pretty amazing. It made the livestream pretty effortless and the features are awesome, even for the free version. It was great to have some of my friends watch me, and then interact with them and random people through chat. It was also good knowing that I was also recording a local version of the files, so I could make a timelapse video later. Knowing the video was being recorded also made me a lot more self-conscious about my computer use, as if someone was watching over my shoulder. 
It made me realize that sometimes I spend too much time in seemingly inane tasks (I ended up wasting the longest time just to get some text alignment the way I wanted - it'll probably drive someone crazy if they watch it) and that I do way too many typos where writing code. I pretty much spend half of the time writing a line and the other half fixing the crazy characters in it. My own stream was probably boring to watch since I was coding for the most time. But livestreaming is one of the cool things to do as a spectator too. It was great seeing other people working - I had a few tabs opened on my second monitor all the time. It's actually a bit sad, because if I could, I could have spent the whole weekend just watching other people working! But I had to do my own work, so I'd only do it once in a while, when resting for a bit. Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended up going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but I also went overboard. For example: to know the state of a plate (who owns it, who's conquering it and how much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They think it's some kind of prop, so they never actually read its details. Plus, if you're zoomed out too much, you can't actually read it, so it's hard to know what's going on with the game until you zoom in to the area of a specific plate. One other solution that didn't turn out to be as perfect as I thought was how to indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its color denotes the team owner. But it's something that fits so well into the design that people never realized it, until they were told about it. 
In the end, the idea of going with a full physical metaphor is one that should be done with care. Things that are very important risk becoming background noise, unless the player knows its importance. Originally, I wanted to avoid any kind of heads-up display in my game. In the end, I ended up adding it at the bottom to indicate your credits and bases owned, as well as the hideous out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple HUD from the start, especially one that indicated each team's colors and general state of the game without the need for zooming in and out. Development Development went fast. But not fast enough. Even though I worked around 32+ hours for this Ludum Dare, the biggest problem I had to face in the end was overscoping. I had too much planned, and could not get it all done. Content-wise, I had several kinds of pasta planned - Wikipedia is just amazing in that regard, split into several different groups, from small Pastina to huge Pasta al forno. But because of time constraints, I ended up scratching most of them, and ended up with 5 different types of small pasta - barely something to start when talking about the evolution of Pasta. Pastas used in the game. Unfortunately, the macs where never used Which is one of the saddest things about the project, really. It had the framework and the features to allow an endless number of elements in there, but I just did not have time to draw the rest of the assets needed (something I loved to do, by the way)."
+                        else
+                            prompt="It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on web. Playing on web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. 
Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. 
Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around. If pasta from other team are around, though, they get locked down in their attempt, unable to conquer the plate, until one of them die (think Battlefield's standard 'Conquest' mode). You get points every second for every plate you own. Over time, the concept also evolved to use an Italian bistro as its main scenario. Carlos, Carlos' Bistro's founder and owner Setup No major changes were made from my work setup. I used FDT and Starling creating an Adobe AIR (ActionScript) project, all tools or frameworks I already had some knowledge with. One big change for me was that I livestreamed my work through a twitch.tv account. This was a new thing for me. As recommended by Roushey, I used a program called XSplit and I got to say, it is pretty amazing. It made the livestream pretty effortless and the features are awesome, even for the free version. It was great to have some of my friends watch me, and then interact with them and random people through chat. It was also good knowing that I was also recording a local version of the files, so I could make a timelapse video later. Knowing the video was being recorded also made me a lot more self-conscious about my computer use, as if someone was watching over my shoulder. 
It made me realize that sometimes I spend too much time in seemingly inane tasks (I ended up wasting the longest time just to get some text alignment the way I wanted - it'll probably drive someone crazy if they watch it) and that I do way too many typos where writing code. I pretty much spend half of the time writing a line and the other half fixing the crazy characters in it. My own stream was probably boring to watch since I was coding for the most time. But livestreaming is one of the cool things to do as a spectator too. It was great seeing other people working - I had a few tabs opened on my second monitor all the time. It's actually a bit sad, because if I could, I could have spent the whole weekend just watching other people working! But I had to do my own work, so I'd only do it once in a while, when resting for a bit. Design Although I wanted some simple, low-fi, high-contrast kind of design, I ended up going with somewhat realistic (vector) art. I think it worked very well, fitting the mood of the game, but I also went overboard. For example: to know the state of a plate (who owns it, who's conquering it and how much time they have left before conquering it, which pasta units are in the queue, etc), you have to look at the plate's bill. The problem I realized when doing some tests is that people never look at the bill! They think it's some kind of prop, so they never actually read its details. Plus, if you're zoomed out too much, you can't actually read it, so it's hard to know what's going on with the game until you zoom in to the area of a specific plate. One other solution that didn't turn out to be as perfect as I thought was how to indicate who a plate base belongs to. In the game, that's indicated by the plate's decoration - its color denotes the team owner. But it's something that fits so well into the design that people never realized it, until they were told about it. 
In the end, the idea of going with a full physical metaphor is one that should be done with care. Things that are very important risk becoming background noise, unless the player knows its importance. Originally, I wanted to avoid any kind of heads-up display in my game. In the end, I ended up adding it at the bottom to indicate your credits and bases owned, as well as the hideous out-of-place-and-still-not-obvious 'Call Waiter' button. But in hindsight, I should have gone with a simple HUD from the start, especially one that indicated each team's colors and general state of the game without the need for zooming in and out. Development Development went fast. But not fast enough. Even though I worked around 32+ hours for this Ludum Dare, the biggest problem I had to face in the end was overscoping. I had too much planned, and could not get it all done. Content-wise, I had several kinds of pasta planned - Wikipedia is just amazing in that regard, split into several different groups, from small Pastina to huge Pasta al forno. But because of time constraints, I ended up scratching most of them, and ended up with 5 different types of small pasta - barely something to start when talking about the evolution of Pasta. Pastas used in the game. Unfortunately, the macs where never used Which is one of the saddest things about the project, really. It had the framework and the features to allow an endless number of elements in there"
+                        fi
+                    elif [[ "${input}" == "1024" ]]; then
+                        if [[ "${model}" == "llama"* ]]; then
+                            prompt="It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I have not seen any entry using this at its core gameplay. 
Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. 
Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in the vicinity of a plate,"
+                        elif [[ "${model}" == "baichuan"* ]]; then
+                            ids="195, 2287, 1414, 3904, 92323, 1377, 14448, 72, 2136, 1559, 1906, 2124, 20621, 92334, 3838, 1376, 1352, 1398, 1542, 11198, 92404, 1415, 10960, 92323, 1377, 1415, 1352, 5460, 72, 45986, 1415, 1352, 5460, 4067, 92323, 1619, 1438, 1527, 1375, 80244, 5957, 6548, 1412, 5163, 683 0, 1377, 1434, 1559, 1425, 1346, 4177, 35074, 72, 2945, 1414, 1346, 2718, 1406, 92404, 92322, 1795, 1375, 3861, 1670, 72, 1406, 1556, 1596, 1899, 1987, 11550, 92323, 2139, 1370, 1376, 32 06, 1352, 13972, 1817, 4105, 2126, 92395, 87412, 2611, 72, 37105, 18493, 1744, 1352, 10083, 1495, 5435, 1661, 1376, 1352, 37530, 15385, 1660, 1406, 1755, 1375, 4467, 72, 48091, 92323, 14 06, 1755, 1452, 4752, 1376, 1817, 3650, 1376, 2521, 1406, 4891, 1375, 2144, 92323, 31063, 26802, 92311, 63, 2982, 1436, 1346, 2718, 1376, 24411, 92395, 1621, 1712, 92323, 5384, 19316, 92 323, 8737, 2048, 1374, 3970, 92323, 16099, 1513, 1346, 2608, 63, 5180, 3019, 72, 1406, 1495, 11115, 1434, 1406, 2006, 5939, 1780, 10083, 2386, 1442, 72, 1593, 1352, 2163, 92323, 1352, 42 43, 1436, 1346, 10083, 1795, 2124, 19176, 4282, 92404, 1374, 1346, 2521, 1414, 1434, 15562, 1414, 1639, 88730, 72, 1745, 9493, 1899, 3473, 21901, 13065, 52904, 1744, 1750, 92323, 1436, 1 352, 1877, 30911, 13624, 66410, 35939, 72, 1956, 19109, 2183, 81273, 1414, 92323, 1374, 1751, 10306, 92323, 1346, 2220, 3937, 1376, 8961, 15562, 1376, 1346, 8318, 11043, 1346, 7143, 72, 2400, 1414, 1442, 1346, 2521, 74, 1593, 1346, 2521, 92323, 1438, 1859, 1375, 3829, 2982, 1375, 6282, 1452, 17869, 72, 3246, 3829, 6084, 2681, 1817, 15562, 1414, 11711, 1375, 1425, 1795, 72, 2615, 1438, 2754, 1352, 5322, 1375, 4714, 1847, 1375, 34734, 2982, 92323, 1442, 92404, 92319, 1538, 15562, 18766, 92311, 63, 1442, 92404, 92319, 1352, 16893, 1376, 25388, 2413, 92323 , 1352, 1365, 1620, 36465, 1508, 11101, 2277, 1375, 14635, 1352, 4752, 1376, 15562, 72, 18282, 1720, 92317, 22979, 1377, 1346, 16870, 2968, 12542, 92323, 1434, 92404, 92319, 1538, 2982, 1434, 
13666, 6609, 1643, 1352, 2126, 2088, 72, 38891, 92323, 1751, 7074, 1381, 4123, 3791, 1777, 29772, 1817, 1375, 3554, 1495, 1538, 1436, 1817, 1406, 4891, 1375, 3554, 92323, 1619, 143 6, 1817, 1406, 2182, 1538, 72, 1406, 3753, 92404, 92313, 2030, 1375, 3554, 1452, 2124, 1700, 20361, 2413, 92404, 81273, 1377, 6441, 1421, 2181, 1442, 15562, 72, 1956, 1414, 1346, 4243, 9 2323, 1376, 3583, 92323, 1987, 1721, 12511, 1506, 1696, 1755, 1375, 4467, 72, 2145, 52618, 1508, 1352, 22398, 14448, 92323, 1538, 2009, 9128, 1375, 1656, 2386, 1442, 72, 1406, 92404, 923 22, 2596, 1352, 1911, 2705, 6518, 1495, 1899, 1352, 2065, 1376, 19649, 9579, 92323, 19810, 72, 3177, 3404, 92323, 1406, 1527, 1538, 3952, 1780, 8049, 2774, 1528, 1474, 1714, 9050, 31063,  72, 2116, 1382, 92323, 1528, 1414, 1829, 1346, 2431, 7749, 1377, 1887, 1346, 2310, 1406, 6187, 1538, 1375, 1425, 1449, 14985, 1436, 1352, 2521, 4752, 92323, 1377, 6891, 7714, 1375, 4714 , 10591, 1406, 3710, 1767, 1656, 1628, 72, 4009, 8396, 4752, 1495, 1375, 3554, 2982, 2003, 20027, 7156, 1375, 34734, 1375, 1346, 2763, 2963, 1619, 1755, 1773, 3650, 1376, 1365, 4242, 470 5, 1375, 4027, 1771, 1513, 4000, 1679, 72, 1406, 3650, 1376, 1755, 1528, 5690, 1376, 3558, 44103, 16283, 1374, 3970, 6322, 1346, 2161, 50038, 1500, 1346, 3970, 9383, 1458, 1522, 3443, 13 74, 92311, 92338, 92335, 92335, 92336, 92345, 1401, 10351, 62076, 1376, 3583, 92351, 1619, 1406, 8418, 92404, 92313, 2336, 1376, 21650, 1458, 2278, 92345, 5857, 92351, 34139, 1412, 1434,  72, 58958, 92319, 1738, 1751, 2763, 13556, 92323, 1449, 1609, 4654, 52260, 5939, 5773, 1972, 1812, 1352, 15562, 10083, 72, 2400, 1847, 1375, 1977, 1442, 1656, 74, 5599, 1438, 1352, 1372 , 3139, 92323, 1500, 11336, 1352, 58958, 74, 1481, 4120, 1377, 3895, 4752, 3642, 1375, 1643, 1899, 1751, 26645, 92323, 1642, 19810, 6302, 1643, 1352, 4752, 1376, 3206, 2982, 1670, 1352, 15562, 1376, 16870, 92314, 72, 1481, 1629, 1406, 3710, 1670, 1442, 1352, 1629, 1442, 41324, 1795, 1442, 1767, 1656, 92323, 1679, 1406, 
6187, 1375, 1596, 1436, 1442, 72, 56573, 1646, 1436 , 1751, 21132, 1932, 63, 53865, 1451, 1612, 32546, 1458, 16349, 1696, 4627, 1352, 2124, 7376, 76994, 7255, 82749, 92404, 17865, 17179, 1412, 1751, 1598, 6888, 92351, 4163, 6043, 3020, 13 52, 6779, 92323, 1449, 1442, 5305, 1812, 1352, 4752, 1376, 3623, 3702, 9283, 1376, 46981, 16283, 2386, 1377, 4705, 1375, 34734, 3428, 1636, 4676, 1581, 63, 9951, 1974, 72, 1401, 17082, 4 752, 2328, 1495, 1434, 1352, 2521, 1767, 1656, 1375, 9543, 1847, 1352, 45634, 2312, 89442, 41739, 3642, 1375, 4482, 92311, 63, 1508, 30120, 1513, 1346, 8156, 12956, 5163, 72, 3177, 1352,  4752, 24976, 1629, 1500, 3129, 1812, 1528, 92345, 1438, 1484, 12396, 1346, 5163, 72, 2136, 1527, 1597, 2106, 15556, 92323, 1436, 1414, 1597, 2124, 12539, 92404, 72, 2945, 1484, 92311, 9 2358, 1721, 10831, 1474, 1352, 5163, 92323, 2470, 1436, 1609, 2106, 15556, 72, 4902, 15556, 1559, 43826, 3110, 9283, 1376, 46981, 72, 2136, 1616, 1679, 1508, 2124, 77777, 92404, 1771, 18 99, 1346, 12277, 72, 5751, 3127, 1382, 1484, 2928, 1765, 3663, 92399, 1773, 1484, 12143, 92323, 1773, 1484, 16310, 72, 2700, 1527, 32607, 2124, 27824, 92319, 3505, 1660, 1484, 3949, 1960 , 1513, 1597, 19002, 1458, 5035, 2104, 1436, 1346, 2717, 1376, 19002, 92351, 72, 10552, 87921, 92323, 1597, 3127, 1382, 2104, 16283, 2386, 72, 8635, 39428, 1414, 1375, 12116, 1375, 1721,  29614, 92323, 1374, 3281, 1375, 63701, 1771, 1458, 2146, 17869, 1376, 1352, 2521, 1414, 3623, 1597, 46981, 63701, 1581, 1352, 29614, 1415, 1352, 5163, 92351, 72, 2400, 1636, 1484, 2570,  33603, 92323, 1679, 1887, 2176, 87921, 92323, 1438, 1527, 1908, 3829, 1744, 1597, 46981, 1458, 25815, 51297, 92343, 1500, 11041, 92378, 2284, 31989, 92351, 72, 4902, 46981, 3956, 92404,  92313, 1795, 1721, 1808, 92404, 92319, 46981, 92323, 1679, 1841, 1636, 3099, 92323, 1636, 6450, 22308, 1474, 2470, 1721, 3428, 1661, 14585, 72, 2136, 1762, 19002, 1412, 1721, 3127, 1382 , 1597, 2106, 46981, 11451, 72, 10552, 1346, 46981, 1414, 1374, 1352, 61854, 
1376, 1346, 15556, 92323, 196" 
+                        elif [[ "${model}" == "gptj-6b" ]]; then
+                            prompt="It is done, and submitted. You can play 'Survival of the Tastiest' on Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. 
Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. 
Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around. If pasta from other team are around, though, they get locked down in their attempt, unable to conquer the plate, until one of them die (think Battlefield's standard 'Conquest' mode). You get points every second for every plate you own. Over time, the concept"
+                        else
+                            prompt="It is done, and submitted. You can play 'Survival of the Tastiest' on the Android, and on the web. Playing on the web works, but you have to simulate multiple touch for table moving and that can be a bit confusing. There is a lot I'd like to talk about. I will go through every topic, insted of making the typical what went right/wrong list. Concept Working over the theme was probably one of the hardest tasks which I had to face. Originally, I had an idea of what kind of game I wanted to develop, gameplay wise - something with a lot of enemies/actors, simple graphics, maybe set in the space, controlled from a top-down view. I was confident that I could fit any theme around it. In the end, the problem with a theme like 'Evolution' in a game is that evolution is unassisted. It happens through several seemingly random mutations over time, with the most apt permutation surviving. This genetic car simulator is, in my opinion, a great example of actual evolution of a species facing a challenge. But is it a game? In a game, you need to control something to reach an objective. That control goes against what evolution is supposed to be like. If you allow the user to pick how to evolve something, it's not evolution anymore - it's the equivalent of intelligent design, the fable invented by creationists to combat the idea of evolution. Being agnostic and a Pastafarian, that's not something that rubbed me the right way. Hence, my biggest dillema when deciding what to create was not with what I wanted to create, but with what I did not. I didn't want to create an 'intelligent design' simulator and wrongly call it evolution. This is a problem, of course, every other contestant also had to face it. And judging by the entries submitted, not many managed to work around it. I'd say the only real solution was through the use of artificial selection, somehow. So far, I haven't seen any entry using this at its core gameplay. 
Alas, this is just a fun competition and after a while I decided not to be as strict with the game idea, and allowed myself to pick whatever I thought would work out. My initial idea was to create something where humanity tried to evolve to a next level, but had some kind of foe trying to stop them from doing so. I kind of had this image of human souls flying in space towards a monolith or a space baby (all based in 2001: A Space Odyssey of course) but I couldn't think of compelling (read: serious) mechanics for that. Borgs were my next inspiration, as their whole hypothesis fit pretty well into the evolution theme. But how to make it work? Are you the borg, or fighting the Borg? The third and final idea came to me through my girlfriend, who somehow gave me the idea of making something about the evolution of Pasta. The more I thought about it the more it sounded like it would work, so I decided to go with it. Conversations with my inspiring co-worker Roushey (who also created the 'Mechanical Underdogs' signature logo for my intros) further matured the concept, as it involved into the idea of having individual pieces of pasta flying around and trying to evolve until they became all-powerful. A secondary idea here was that the game would work to explain how the Flying Spaghetti Monster came to exist - by evolving from a normal dinner table. So the idea evolved more or less into this: you are sitting a table. You have your own plate, with is your 'base'. There are 5 other guests at the table, each with their own plate. Your plate can spawn little pieces of pasta. You do so by 'ordering' them through a menu. Some pastas are better than others; some are faster, some are stronger. They have varying 'costs', which are debited from your credits (you start with a number of credits). Once spawned, your pastas start flying around. 
Their instinct is to fly to other plates, in order to conquer them (the objective of the game is having your pasta conquer all the plates on the table). But they are really autonomous, so after being spawned, you have no control over your pasta (think DotA or LoL creeps). Your pasta doesn't like other people's pasta, so if they meet, they shoot sauce at each other until one dies. You get credits for other pastas your own pasta kill. Once a pasta is in the vicinity of a plate, it starts conquering it for its team. It takes around 10 seconds for a plate to be conquered; less if more pasta from the same team are around."
+                        fi
+                    fi
+                    ctx=$(($output + $input + 10))
+                    logs_file="${model}-${precision}-${cores_per_instance}-${input}-${output}.log"
+                    ## prepare model.bin
+                    quantized_model="${model}-${precision}.bin"
+                    if [[ ! -f ${quantized_model} ]]; then
+                        if [[ ${precision} == "q4_j" ]]; then
+                            ${quant_script} --model_file ${f32_model} --out_file ${quantized_model} --nthread $cores_per_instance --weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_dtype fp32 --alg sym
+                        else
+                            echo "Not supported precision on TP"
+                            exit 1
+                        fi
+                    fi
+                    ## run inference
+                    export LANG=en_US.UTF-8
+                    export LC_ALL=en_US.UTF-8
+                    if [[ "${model}" == "baichuan"* ]]; then
+                        echo $infer_cmd -t $cores_per_instance -c ${ctx} -n ${output} -m ${model}-${precision}.bin --ids \"$ids\"  >  run_${model}.sh
+                    else
+                        echo $infer_cmd --seed 1234 -t $cores_per_instance -b 2047 -c ${ctx} -n ${output} -m ${model}-${precision}.bin -p \"$prompt\"  >  run_${model}.sh
+                    fi
+                    # taskset -c 0-$(($cores_per_instance * 1 - 1)) bash run_${model}.sh 2>&1 | tee ${WORKING_DIR}/"origin_"${logs_file}
+                    # TP_LOCAL_SIZE=2 mpirun -n 1 taskset -c 0-$(($cores_per_instance * 1 - 1)) bash run_${model}.sh  : -n 1 taskset -c $(($cores_per_instance))-$(($cores_per_instance * 2- 1)) bash run_${model}.sh  2>&1 | tee ${WORKING_DIR}/"tp_"${logs_file}
+                    # collect_perf "origin_"${logs_file} 1 ${precision} ${input}
+                    # collect_perf "tp_"${logs_file} 2 ${precision} ${input}
+                    check_accuracy "origin_"${logs_file} "tp_"${logs_file} "${prompt}" 2
+                    exit 1
+                done
+            done
+        done
+    done
+    conda deactivate >/dev/null 2>&1
+}
+
+# check_accuracy ORIGIN_LOG TP_LOG PROMPT RANK: compare the token that follows the prompt tail (its last <=3 words)
+# in ORIGIN_LOG with the RANK-th such match in TP_LOG; exits 1 when a token is missing or the two differ.
+function check_accuracy {
+    local origin_output="$1" tp_output="$2" prompt="$3" rank="$4" last_words origin_first_token tp_first_token
+    last_words=$(echo "$prompt" | awk '{s = (NF > 2) ? NF - 2 : 1; for (i = s; i < NF; i++) printf "%s ", $i; printf "%s", $NF}')
+    # \Q..\E matches the prompt tail literally; "|| true" lets a missing match reach the verdict below under set -e
+    origin_first_token=$(grep -oP "\Q${last_words}\E \K\S+" "$origin_output") || true
+    tp_first_token=$(grep -oP "\Q${last_words}\E \K\S+" "$tp_output" | awk -v rank="$rank" 'NR==rank {print $1}') || true
+    echo "origin first token is $origin_first_token and tp first token is ${tp_first_token}"
+    if [[ -n "$origin_first_token" && "$origin_first_token" == "$tp_first_token" ]]; then
+        echo "accuracy is good"
+    else
+        echo "accuracy not good!"
+        exit 1
+    fi
+}
+
+# collect_perf LOG RANK PRECISION INPUT_TOKENS: append one CSV row (model, precision, rank, input tokens,
+# first-token ms, per-token ms, log path) for ${WORKING_DIR}/LOG to tp_summary.log, then print its first and last rows.
+function collect_perf {
+    local log_dir="${WORKING_DIR}/$1" rank="$2" precision="$3" input_tokens="$4" eval_time first_token_time
+    # per-token latency is read from the non-prompt "eval time" line, first-token latency from the prompt one
+    eval_time=($(grep -i 'eval time' "${log_dir}" | grep -v "prompt" | grep -oP '\(\s*\K[0-9]+\.[0-9]+(?= ms per token)'))
+    first_token_time=($(grep -i 'eval time' "${log_dir}" | grep "prompt" | grep -o -E '=\s*[0-9]+\.[0-9]+ ms' | awk '{print $2}'))
+    # fixed format string: values containing % or \ are written verbatim instead of being interpreted by printf
+    printf '%s,%s,%s,%s,%s,%s,%s\n' "${model}" "${precision}" "${rank}" "${input_tokens}" "${first_token_time}" "${eval_time}" "${log_dir}" | tee -a "${WORKING_DIR}/tp_summary.log"
+    set +x
+    echo -e "\n\n-------- Summary --------"
+    sed -n '1p;$p' "${WORKING_DIR}/tp_summary.log" | column -t -s ','
+}
+
+
+main "$@" 2>&1 | tee "${WORKING_DIR}/launch.log"  # quoted so arguments and the log path keep their word boundaries
diff --git a/neural_speed/application/main_run.cpp b/neural_speed/application/main_run.cpp
index b48d9d192..7ece0b33d 100644
--- a/neural_speed/application/main_run.cpp
+++ b/neural_speed/application/main_run.cpp
@@ -442,6 +442,15 @@ int main(int argc, char** argv) {  // NOLINT
     model_reset_timings(ctx);
   }
 
+#ifdef NE_TP_MODEL
+  // sync here so that all ranks enter inference at the same time
+  parallel_context* p_ctx = init_parallel_context();
+  if (get_tp_size(p_ctx) > 1) {
+    barrier(p_ctx);
+  }
+
+#endif
+
   while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
     // predict
     if (embd.size() > 0) {
diff --git a/neural_speed/core/layers/mha_dense.cpp b/neural_speed/core/layers/mha_dense.cpp
index 8f49a722a..c9f7e8dcb 100644
--- a/neural_speed/core/layers/mha_dense.cpp
+++ b/neural_speed/core/layers/mha_dense.cpp
@@ -1344,8 +1344,19 @@ class MHAStableInterface {
     const auto group_heads = p.head_num / p.heads_kv;
     const auto sl_diff = p.sl_kv - p.sl_q;
 
+    // TP will need the real rank order of k
+    int32_t k_offset = 0;
+    int32_t log_head_num = p.head_num;
+#ifdef NE_TP_MODEL
+    parallel_context* p_ctx = init_parallel_context();
+    int32_t world_size = get_tp_size(p_ctx);
+    int32_t rank = get_tp_rank(p_ctx);
+    if (world_size > 1) k_offset += rank * p.head_num;
+    log_head_num *= world_size;
+#endif
+
     // alibi slope
-    const int n_heads_log2_floor = 1 << static_cast<int>(floor(log2(p.head_num)));
+    const int n_heads_log2_floor = 1 << static_cast<int>(floor(log2(log_head_num)));
     const float m0 = powf(2.0f, -(8.f) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(8.f / 2.0f) / n_heads_log2_floor);
 
@@ -1381,9 +1392,10 @@ class MHAStableInterface {
           const int ihkv = ihn / group_heads;
           const int m_size = std::min(M_TILE, p.sl_q - i_m);
 
-          const auto alibi_ihn_m = !is_alibi                    ? 0.f
-                                   : (ihn < n_heads_log2_floor) ? powf(m0, ihn + 1)
-                                                                : powf(m1, 2 * (ihn - n_heads_log2_floor) + 1);
+          const auto alibi_ihn_m = !is_alibi ? 0.f
+                                   : (ihn + k_offset < n_heads_log2_floor)
+                                       ? powf(m0, ihn + k_offset + 1)
+                                       : powf(m1, 2 * (ihn + k_offset - n_heads_log2_floor) + 1);
 
           float s_max[M_TILE]{};  // maximum for each row of the S matrix
           std::fill_n(s_max, M_TILE, -INFINITY);
@@ -1718,7 +1730,17 @@ void bestla_fusion_attn_forward_ref(const attn_fwd_args_t<Q_T, K_T, V_T, DST_T>&
   const auto ROWPACK = p.V_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK4   ? 4
                        : p.V_layout == ATTN_FWD_LAYOUT_NTILE48_ROWPACK2 ? 2
                                                                         : 0;
-  const int n_heads_log2_floor = 1 << static_cast<int>(floor(log2(p.head_num)));
+  // TP will need the real rank order of k
+  int32_t k_offset = 0;
+  int32_t log_head_num = p.head_num;
+#ifdef NE_TP_MODEL
+  parallel_context* p_ctx = init_parallel_context();
+  int32_t world_size = get_tp_size(p_ctx);
+  int32_t rank = get_tp_rank(p_ctx);
+  if (world_size > 1) k_offset += rank * p.head_num;
+  log_head_num = p.head_num * world_size;
+#endif
+  const int n_heads_log2_floor = 1 << static_cast<int>(floor(log2(log_head_num)));
   const float m0 = powf(2.0f, -(8.f) / n_heads_log2_floor);
   const float m1 = powf(2.0f, -(8.f / 2.0f) / n_heads_log2_floor);
 
@@ -1737,9 +1759,10 @@ void bestla_fusion_attn_forward_ref(const attn_fwd_args_t<Q_T, K_T, V_T, DST_T>&
         const auto unmasked = is_causal ? sl_diff + i + 1 : p.sl_kv;
         const auto curr_row = std::unique_ptr<float[]>(new float[unmasked]);
 
-        const auto alibi_ihn_m = !is_alibi                    ? 0.f
-                                 : (ihn < n_heads_log2_floor) ? powf(m0, ihn + 1)
-                                                              : powf(m1, 2 * (ihn - n_heads_log2_floor) + 1);
+        const auto alibi_ihn_m = !is_alibi ? 0.f
+                                 : (ihn + k_offset < n_heads_log2_floor)
+                                     ? powf(m0, ihn + k_offset + 1)
+                                     : powf(m1, 2 * (ihn + k_offset - n_heads_log2_floor) + 1);
 
         // Q x K
         float row_max = -INFINITY;
diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c
index 18f51e3ae..8ce02b524 100644
--- a/neural_speed/core/ne_layers.c
+++ b/neural_speed/core/ne_layers.c
@@ -1324,13 +1324,9 @@ struct ne_tensor* ne_debug_op(struct ne_context* ctx, struct ne_tensor* a, ne_de
   return result;
 }
 
-struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_dup_impl(ctx, a, false);
-}
+struct ne_tensor* ne_dup(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, false); }
 
-struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_dup_impl(ctx, a, true);
-}
+struct ne_tensor* ne_dup_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_dup_impl(ctx, a, true); }
 
 // ne_add
 
@@ -1683,13 +1679,9 @@ struct ne_tensor* ne_sqr_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_sqr_impl(ctx, a, false);
-}
+struct ne_tensor* ne_sqr(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, false); }
 
-struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_sqr_impl(ctx, a, true);
-}
+struct ne_tensor* ne_sqr_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqr_impl(ctx, a, true); }
 
 // ne_sqrt
 
@@ -1710,13 +1702,9 @@ struct ne_tensor* ne_sqrt_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_sqrt_impl(ctx, a, false);
-}
+struct ne_tensor* ne_sqrt(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, false); }
 
-struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_sqrt_impl(ctx, a, true);
-}
+struct ne_tensor* ne_sqrt_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sqrt_impl(ctx, a, true); }
 
 // ne_log
 
@@ -1737,13 +1725,9 @@ struct ne_tensor* ne_log_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_log_impl(ctx, a, false);
-}
+struct ne_tensor* ne_log(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, false); }
 
-struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_log_impl(ctx, a, true);
-}
+struct ne_tensor* ne_log_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_log_impl(ctx, a, true); }
 
 // ne_sum
 
@@ -1853,13 +1837,9 @@ struct ne_tensor* ne_abs_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_abs_impl(ctx, a, false);
-}
+struct ne_tensor* ne_abs(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, false); }
 
-struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_abs_impl(ctx, a, true);
-}
+struct ne_tensor* ne_abs_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_abs_impl(ctx, a, true); }
 
 // ne_sgn
 
@@ -1880,13 +1860,9 @@ struct ne_tensor* ne_sgn_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_sgn_impl(ctx, a, false);
-}
+struct ne_tensor* ne_sgn(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, false); }
 
-struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_sgn_impl(ctx, a, true);
-}
+struct ne_tensor* ne_sgn_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_sgn_impl(ctx, a, true); }
 
 // ne_neg
 
@@ -1907,13 +1883,9 @@ struct ne_tensor* ne_neg_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_neg_impl(ctx, a, false);
-}
+struct ne_tensor* ne_neg(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, false); }
 
-struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_neg_impl(ctx, a, true);
-}
+struct ne_tensor* ne_neg_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_neg_impl(ctx, a, true); }
 
 // ne_step
 
@@ -1934,13 +1906,9 @@ struct ne_tensor* ne_step_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_step_impl(ctx, a, false);
-}
+struct ne_tensor* ne_step(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, false); }
 
-struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_step_impl(ctx, a, true);
-}
+struct ne_tensor* ne_step_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_step_impl(ctx, a, true); }
 
 // ne_relu
 
@@ -1961,13 +1929,9 @@ struct ne_tensor* ne_relu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_relu_impl(ctx, a, false);
-}
+struct ne_tensor* ne_relu(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, false); }
 
-struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_relu_impl(ctx, a, true);
-}
+struct ne_tensor* ne_relu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_relu_impl(ctx, a, true); }
 
 // ne_gelu
 
@@ -1988,13 +1952,9 @@ struct ne_tensor* ne_gelu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_gelu_impl(ctx, a, false);
-}
+struct ne_tensor* ne_gelu(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, false); }
 
-struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_gelu_impl(ctx, a, true);
-}
+struct ne_tensor* ne_gelu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_gelu_impl(ctx, a, true); }
 
 // ne_silu
 
@@ -2015,13 +1975,9 @@ struct ne_tensor* ne_silu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_silu_impl(ctx, a, false);
-}
+struct ne_tensor* ne_silu(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, false); }
 
-struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_silu_impl(ctx, a, true);
-}
+struct ne_tensor* ne_silu_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_silu_impl(ctx, a, true); }
 
 // ne_silu_back
 
@@ -2063,13 +2019,9 @@ struct ne_tensor* ne_norm_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_norm(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_norm_impl(ctx, a, false);
-}
+struct ne_tensor* ne_norm(struct ne_context* ctx, struct ne_tensor* a) { return ne_norm_impl(ctx, a, false); }
 
-struct ne_tensor* ne_norm_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_norm_impl(ctx, a, true);
-}
+struct ne_tensor* ne_norm_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_norm_impl(ctx, a, true); }
 
 struct ne_tensor* ne_rms_norm_impl(struct ne_context* ctx, struct ne_tensor* a, bool inplace, float eps) {
   bool is_node = false;
@@ -2415,13 +2367,9 @@ struct ne_tensor* ne_cont_impl(struct ne_context* ctx, struct ne_tensor* a, bool
   return result;
 }
 
-struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_cont_impl(ctx, a, false);
-}
+struct ne_tensor* ne_cont(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, false); }
 
-struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_cont_impl(ctx, a, true);
-}
+struct ne_tensor* ne_cont_inplace(struct ne_context* ctx, struct ne_tensor* a) { return ne_cont_impl(ctx, a, true); }
 
 // ne_reshape
 
@@ -2968,9 +2916,7 @@ struct ne_tensor* ne_soft_max_impl(struct ne_context* ctx, struct ne_tensor* a,
   return result;
 }
 
-struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) {
-  return ne_soft_max_impl(ctx, a, false);
-}
+struct ne_tensor* ne_soft_max(struct ne_context* ctx, struct ne_tensor* a) { return ne_soft_max_impl(ctx, a, false); }
 
 struct ne_tensor* ne_soft_max_inplace(struct ne_context* ctx, struct ne_tensor* a) {
   return ne_soft_max_impl(ctx, a, true);
@@ -7653,7 +7599,7 @@ static void ne_compute_forward_alibi_f32(const struct ne_compute_params* params,
   }
 
   const int n_past = ((int32_t*)src1->data)[0];
-  const int n_head = ((int32_t*)src1->data)[1];
+  int n_head = ((int32_t*)src1->data)[1];
   const float max_bias = ((float*)src1->data)[2];
 
   assert(n_past >= 0);
@@ -7674,6 +7620,15 @@ static void ne_compute_forward_alibi_f32(const struct ne_compute_params* params,
   assert(nb0 == sizeof(float));
   assert(ne1 + n_past == ne0);
   (void)n_past;
+  // TP will need the real rank order of k
+  int32_t k_offset = 0;
+#ifdef NE_TP_MODEL
+  parallel_context* p_ctx = init_parallel_context();
+  int32_t world_size = get_tp_size(p_ctx);
+  int32_t rank = get_tp_rank(p_ctx);
+  if (world_size > 1) k_offset += rank * n_head;
+  n_head *= world_size;
+#endif
 
   // add alibi to src0 (KQ_scaled)
   const int n_heads_log2_floor = 1 << (int)floor(log2(n_head));
@@ -7691,10 +7646,10 @@ static void ne_compute_forward_alibi_f32(const struct ne_compute_params* params,
 
         float m_k;
 
-        if (k < n_heads_log2_floor) {
-          m_k = powf(m0, k + 1);
+        if (k + k_offset < n_heads_log2_floor) {
+          m_k = powf(m0, k + k_offset + 1);
         } else {
-          m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+          m_k = powf(m1, 2 * (k + k_offset - n_heads_log2_floor) + 1);
         }
 
         pdst[0] = (i - ne0 + 1) * m_k + src[0];
@@ -7714,7 +7669,7 @@ static void ne_compute_forward_alibi_f16(const struct ne_compute_params* params,
   }
 
   const int n_past = ((int32_t*)src1->data)[0];
-  const int n_head = ((int32_t*)src1->data)[1];
+  int n_head = ((int32_t*)src1->data)[1];
   const float max_bias = ((float*)src1->data)[2];
 
   assert(n_past >= 0);
@@ -7735,7 +7690,15 @@ static void ne_compute_forward_alibi_f16(const struct ne_compute_params* params,
   assert(nb0 == sizeof(ne_fp16_t));
   assert(ne1 + n_past == ne0);
   (void)n_past;
-
+  // TP will need the real rank order of k
+  int32_t k_offset = 0;
+#ifdef NE_TP_MODEL
+  parallel_context* p_ctx = init_parallel_context();
+  int32_t world_size = get_tp_size(p_ctx);
+  int32_t rank = get_tp_rank(p_ctx);
+  if (world_size > 1) k_offset += rank * n_head;
+  n_head *= world_size;
+#endif
   // add alibi to src0 (KQ_scaled)
   const int n_heads_log2_floor = 1 << (int)floor(log2(n_head));
 
@@ -7753,10 +7716,10 @@ static void ne_compute_forward_alibi_f16(const struct ne_compute_params* params,
 
         float m_k;
 
-        if (k < n_heads_log2_floor) {
-          m_k = powf(m0, k + 1);
+        if (k + k_offset < n_heads_log2_floor) {
+          m_k = powf(m0, k + k_offset + 1);
         } else {
-          m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+          m_k = powf(m1, 2 * (k + k_offset - n_heads_log2_floor) + 1);
         }
 
         // we return F32
diff --git a/neural_speed/core/parallel_context.cpp b/neural_speed/core/parallel_context.cpp
index b3a4284f2..feff87f3a 100644
--- a/neural_speed/core/parallel_context.cpp
+++ b/neural_speed/core/parallel_context.cpp
@@ -14,6 +14,7 @@
 #include <mpi.h>
 #include "oneapi/ccl.hpp"
 #include "parallel_context.h"
+#include "shared_memory_ccl.hpp"
 
 class parallel_class {
  public:
@@ -26,7 +27,12 @@ class parallel_class {
       return instance_p;
     }
   }
-  ~parallel_class() { delete pcomm; }
+  ~parallel_class() {
+    delete pcomm;
+    if (use_shm) {
+      shared_close(shm_name, cbuffer, world_size * sizeof(struct ccl_buffer));
+    }
+  }
 
   bool is_master() { return rank == 0; }
 
@@ -37,8 +43,13 @@ class parallel_class {
 
   // From some example code of oneCCL, inplace reducing is supported
   void reduce_add(float* sendBuf, float* recvBuf, size_t count) {
-    ccl::allreduce(sendBuf, recvBuf, count, ccl::reduction::sum, *pcomm).wait();
+    if (use_shm) {
+      shm_all_reduce(sendBuf, recvBuf, count, rank, world_size);
+    } else {
+      ccl::allreduce(sendBuf, recvBuf, count, ccl::reduction::sum, *pcomm).wait();
+    }
   }
+
   void broadcast(float* buf, size_t count) {
     int root = 0;  // assume always broadcast from master
     ccl::broadcast(buf, count, root, *pcomm).wait();
@@ -71,6 +82,32 @@ class parallel_class {
 
     rank = pcomm->rank();
     world_size = pcomm->size();
+
+    // Check whether all ranks are on the same physical machine.
+    // If true use SHM allreduce
+    auto local_size = std::getenv("TP_LOCAL_SIZE");
+    if (local_size != NULL) {
+      use_shm = std::stoi(local_size) == world_size;
+    }
+    if (use_shm) {
+      void* shared_ptr = nullptr;
+      snprintf(shm_name, 100, "%s_%d", "shared_memory_tp", getuid());
+      if (rank == 0) {
+        cbuffer = (struct ccl_buffer*)malloc(world_size * sizeof(struct ccl_buffer));
+        shared_ptr = shared_create(shm_name, cbuffer, world_size * sizeof(struct ccl_buffer));
+        assert(shared_ptr != nullptr);
+        cbuffer = (struct ccl_buffer*)(shared_ptr);
+        for (int i = 0; i < world_size; i++) {
+          cbuffer[i].state = ccl_begin;
+        }
+      }
+      ccl::barrier(*pcomm).wait();
+      if (rank != 0) {
+        shared_ptr = shared_open(shm_name, world_size * sizeof(struct ccl_buffer));
+        assert(shared_ptr != nullptr);
+        cbuffer = (struct ccl_buffer*)shared_ptr;
+      }
+    }
   }
   static void mpi_finalize() {
     int is_finalized = 0;
@@ -81,6 +118,8 @@ class parallel_class {
     }
   }
 
+  bool use_shm = false;
+  char shm_name[100];
   int world_size;
   int rank;
 
diff --git a/neural_speed/core/shared_memory_ccl.hpp b/neural_speed/core/shared_memory_ccl.hpp
new file mode 100644
index 000000000..f3586d21d
--- /dev/null
+++ b/neural_speed/core/shared_memory_ccl.hpp
@@ -0,0 +1,136 @@
+//  Copyright (c) 2023 Intel Corporation
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+#include <assert.h>
+#include <fcntl.h>
+#include <immintrin.h>
+#include <omp.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include "oneapi/ccl.hpp"
+#include "layers/ele_wise.h"
+
+// states for collectives; each rank's slot walks through them once per shm_all_reduce() chunk
+enum ccl_state {
+  ccl_begin = 0,  // idle: set when the buffer is created and again when a rank has finished a chunk
+  copy_in_done,   // the rank has copied its input chunk into its own slot
+  reduce_done,    // set by rank 0 only, after it has summed all slots into slot 0
+  copy_out_done,  // set by non-zero ranks after copying the reduced chunk out of slot 0
+};
+
+// Map the existing POSIX shared-memory object `name` (`nbytes` long); returns nullptr on any failure.
+void* shared_open(const char* name, size_t nbytes) {
+  int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR);
+  void* ptr = (d != -1) ? mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0) : MAP_FAILED;
+  if (d != -1) close(d);  // the mapping stays valid after the descriptor is closed
+  if (ptr != MAP_FAILED) return ptr;  // mmap reports failure as MAP_FAILED, never as nullptr
+  printf("shared_open %s failed\n", name);
+  return nullptr;
+}
+
+// Create/open shm object `name`, write `nbytes` of `bytes` into it and map it; nullptr on failure or short write.
+void* shared_create(const char* name, void* bytes, size_t nbytes) {
+  int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+  bool ok = (d != -1) && (write(d, bytes, nbytes) == (ssize_t)nbytes);  // compare, do not reuse -1 as a length
+  void* ptr = ok ? mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0) : MAP_FAILED;
+  if (d != -1) close(d);  // the mapping stays valid after the descriptor is closed
+  if (ptr == MAP_FAILED) printf("shared_create %s failed\n", name);
+  return ptr == MAP_FAILED ? nullptr : ptr;
+}
+
+// Unmap the `nbytes`-long mapping at `bytes` and remove the shared-memory object `name`.
+void shared_close(const char* name, void* bytes, size_t nbytes) {
+  // No shm_open() probe: it leaked a descriptor on every call and skipped munmap
+  // as soon as another rank had already unlinked the name.
+  if (bytes != nullptr) munmap(bytes, nbytes);
+  shm_unlink(name);  // fails harmlessly with ENOENT when the name is already gone
+}
+
+#define CCL_BUF_SIZE 1048576  // payload bytes per slot; shm_all_reduce() splits larger inputs into chunks of this size
+struct ccl_buffer {
+  enum ccl_state state;  // handshake flag that the other ranks poll in wait_state_equal()/wait_state_change()
+  char data[CCL_BUF_SIZE];  // staging area for one chunk; slot 0 also receives the reduced result
+};
+struct ccl_buffer* cbuffer;  // array indexed by rank; NOTE(review): defined (not just declared) in a header
+
+// Busy-wait until the slot of rank `index` reports exactly `state`.
+void wait_state_equal(int index, enum ccl_state state) {
+  // read through a volatile pointer so every poll reloads the flag from memory
+  volatile enum ccl_state* flag = &cbuffer[index].state;
+  while (!(*flag == state)) _mm_pause();
+}
+
+// Busy-wait until the slot of rank `index` reports anything other than `state`.
+void wait_state_change(int index, enum ccl_state state) {
+  // read through a volatile pointer so every poll reloads the flag from memory
+  volatile enum ccl_state* flag = &cbuffer[index].state;
+  while (!(*flag != state)) _mm_pause();
+}
+
+// Add the first `num_elements` floats of slots 1..num_buffers-1 into slot 0, in ascending slot order.
+void reduce_fp32_buffers(int num_elements, int num_buffers, struct ccl_buffer* cbuffer) {
+  float* const acc = (float*)(cbuffer[0].data);  // slot 0 accumulates the sum that is later copied out
+  for (int slot = 1; slot != num_buffers && slot < num_buffers; slot++) {
+    ne_vec_add_f32(num_elements, acc, acc, (float*)(cbuffer[slot].data));
+  }
+}
+
+// Reduce all slots into slot 0. TODO only support fp32 reduce, add other data type if needed
+void reduce_buffers(struct ccl_buffer* cbuffer, int num_elements, int num_buffers) {
+  if (num_buffers < 2) {
+    assert(!"Not supported buffer number.");
+    return;
+  }
+  reduce_fp32_buffers(num_elements, num_buffers, cbuffer);
+}
+
+// Sum-allreduce `count` floats from sendBuf into recvBuf across `world_size` ranks through the `cbuffer` slots.
+// Data moves in CCL_BUF_SIZE-byte chunks: every rank stages its chunk in its own slot, rank 0 sums all slots
+// into slot 0, every rank copies slot 0 out, and finally each slot state is handed back to ccl_begin.
+// NOTE(review): slot states are plain stores polled through volatile reads; no explicit memory fences are used.
+void shm_all_reduce(float* sendBuf, float* recvBuf, size_t count, size_t rank, size_t world_size) {
+  const size_t total_bytes = count * sizeof(float);
+  // size_t offset: the previous `int` offset was compared against a size_t and overflowed past 2 GiB
+  for (size_t offset = 0; offset < total_bytes; offset += CCL_BUF_SIZE) {
+    auto send_ptr = (char*)sendBuf + offset;
+    auto recv_ptr = (char*)recvBuf + offset;
+    size_t chunk_size = std::min(total_bytes - offset, (size_t)CCL_BUF_SIZE);
+    size_t chunk_count = chunk_size / sizeof(float);
+
+    memcpy(cbuffer[rank].data, send_ptr, chunk_size);
+    cbuffer[rank].state = copy_in_done;
+    if (rank == 0) {
+      // compute the allreduce result on rank 0 once every other rank has staged its chunk
+      for (size_t i = 1; i < world_size; i++) {
+        wait_state_equal(i, copy_in_done);
+      }
+      reduce_buffers(cbuffer, chunk_count, world_size);
+      cbuffer[rank].state = reduce_done;
+      memcpy(recv_ptr, cbuffer[0].data, chunk_size);
+      // slot 0 may only be reused after every other rank has copied the result out
+      for (size_t i = 1; i < world_size; i++) {
+        wait_state_equal(i, copy_out_done);
+      }
+      cbuffer[rank].state = ccl_begin;
+    } else {
+      wait_state_equal(0, reduce_done);
+      memcpy(recv_ptr, cbuffer[0].data, chunk_size);
+      cbuffer[rank].state = copy_out_done;
+      // if rank 0 spins too fast it could already be in state 1 (copy_in_done) of the next allreduce;
+      // in that case wait_state_equal(0, ccl_begin) could deadlock.
+      // What is certain is that once rank 0 has finished, its state is no longer 2 (reduce_done).
+      wait_state_change(0, reduce_done);
+      cbuffer[rank].state = ccl_begin;
+    }
+  }
+}
diff --git a/neural_speed/models/baichuan/baichuan.cpp b/neural_speed/models/baichuan/baichuan.cpp
index 589d64016..03b1f4a10 100644
--- a/neural_speed/models/baichuan/baichuan.cpp
+++ b/neural_speed/models/baichuan/baichuan.cpp
@@ -71,12 +71,26 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
   const bool is_ring_full = shift_roped_k && n_total > n_past;
   NE_ASSERT(("Shift-RoPE-K to be implemented for AliBi!", !is_ring_full));
 
-  const int n_head = hparams.n_head;
+  int n_head = hparams.n_head;
   const int n_vocab = hparams.n_vocab;
   const int head_size = n_embd / n_head;
   const int n_rot = n_embd / n_head / 2;
   const float attn_scale = 1.f / std::sqrt(head_size);
 
+  bool enable_tp = false;
+#ifdef NE_TP_MODEL
+  parallel_context* p_ctx = init_parallel_context();
+  int32_t world_size = get_tp_size(p_ctx);
+  int32_t rank = get_tp_rank(p_ctx);
+  enable_tp = world_size > 1 ? true : false;
+
+  // after TP the Q K n_head will become 1/world_size
+  if (enable_tp) {
+    n_head /= world_size;
+  }
+#endif
+  int hidden_size = head_size * n_head;
+
   auto& mem_per_token = lctx.mem_per_token;
   auto& buf_compute = lctx.buf_compute;
 
@@ -122,7 +136,6 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
   }
 
   struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd);
-  int hidden_size = inpL->ne[0];
   NE_ASSERT(N == inpL->ne[1]);
 
   for (int il = 0; il < n_layer; ++il) {
@@ -229,13 +242,19 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
         ne_attn_flags_t attn_flags = NE_ATTN_FLAG_IS_ALIBI8;
         if (n_past == 0) attn_flags |= NE_ATTN_FLAG_IS_CAUSAL;  // no causal mask on next-token cases
         struct ne_tensor* KQV_Out = ne_flash_attn(ctx0, query_layer, key_layer, value_layer, attn_scale, attn_flags);
-        cur = ne_view_2d(ctx0, KQV_Out, n_embd, N, n_embd * ne_element_size(KQV_Out), 0);
+        cur = ne_view_2d(ctx0, KQV_Out, hidden_size, N, hidden_size * ne_element_size(KQV_Out), 0);
 
         // F32 mul_mat
         cur = ne_mul_mat(ctx0, model.layers[il].attn[1], cur);
       }
     }
 
+#ifdef NE_TP_MODEL
+    if (enable_tp) {
+      cur = ne_all_reduce(ctx0, cur);
+    }
+#endif
+
     lctx.use_buf(ctx0, 1);
     cur = ne_add_inplace(ctx0, cur, residual);
     residual = cur;
@@ -258,6 +277,11 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
       mlp_output = ne_mul(ctx0, gate, up);
       mlp_output = ne_mul_mat(ctx0, model.layers[il].ffn[1], mlp_output);
     }
+#ifdef NE_TP_MODEL
+    if (enable_tp) {
+      mlp_output = ne_all_reduce(ctx0, mlp_output);
+    }
+#endif
 
     inpL = ne_add_inplace(ctx0, mlp_output, residual);
   }
@@ -273,7 +297,7 @@ static bool baichuan_model_eval_internal(model_context* ctx, const model_input*
 
   lctx.use_buf(ctx0, -1);
   if (embd->ne[0] > 1) {
-    inpL = ne_view_1d(ctx0, inpL, hidden_size, (embd->ne[0] - 1) * hidden_size * ne_element_size(inpL));
+    inpL = ne_view_1d(ctx0, inpL, n_embd, (embd->ne[0] - 1) * n_embd * ne_element_size(inpL));
   }
 
   // lm_head
diff --git a/neural_speed/models/chatglm/chatglm2.cpp b/neural_speed/models/chatglm/chatglm2.cpp
index 03acfd19b..b27210eb1 100644
--- a/neural_speed/models/chatglm/chatglm2.cpp
+++ b/neural_speed/models/chatglm/chatglm2.cpp
@@ -72,14 +72,28 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
   const bool shift_roped_k = lctx.shift_roped_k;
   const bool is_ring_full = shift_roped_k && n_total > n_past;
   const int n_cached = shift_roped_k ? std::min(n_total + N, n_ctx) : (n_past + N);  // #tokens cached after kv-append
-  const int n_head = hparams.n_head;
+  int n_head = hparams.n_head;
   const int n_vocab = hparams.n_vocab;
   const int head_size = n_embd / n_head;
   const int n_rot = head_size / 2;
   const int mqa_scale = n_head / hparams.multi_query_group_num;
-  const int num_kv_heads = hparams.multi_query_group_num;
+  int num_kv_heads = hparams.multi_query_group_num;
+
+  bool enable_tp = false;
+#ifdef NE_TP_MODEL
+  parallel_context* p_ctx = init_parallel_context();
+  int32_t world_size = get_tp_size(p_ctx);
+  int32_t rank = get_tp_rank(p_ctx);
+  enable_tp = world_size > 1 ? true : false;
+
+  // after TP the Q K n_head will become 1/world_size
+  if (enable_tp) {
+    n_head /= world_size;
+    num_kv_heads /= world_size;
+  }
+#endif
 
-  const int hidden_size = n_embd;
+  const int hidden_size = head_size * n_head;
   const int num_attention_heads = n_head;
 
   auto& mem_per_token = lctx.mem_per_token;
@@ -265,10 +279,15 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
         ne_attn_flags_t attn_flags = NE_ATTN_FLAG_NONE;
         if (n_total == 0 || !shift_roped_k) attn_flags |= NE_ATTN_FLAG_IS_CAUSAL;  // no causal mask on next-token cases
         struct ne_tensor* KQV_Out = ne_flash_attn(ctx0, query_layer, key_layer, value_layer, attn_scale, attn_flags);
-        cur = ne_view_2d(ctx0, KQV_Out, n_embd, N, n_embd * ne_element_size(KQV_Out), 0);
+        cur = ne_view_2d(ctx0, KQV_Out, hidden_size, N, hidden_size * ne_element_size(KQV_Out), 0);
       }
       cur = ne_mul_mat(ctx0, model.layers[il].attn[2], cur);
     }
+#ifdef NE_TP_MODEL
+    if (enable_tp) {
+      cur = ne_all_reduce(ctx0, cur);
+    }
+#endif
 
     lctx.use_buf(ctx0, 1);
 
@@ -290,6 +309,12 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
     mlp_output = ne_mul(ctx0, x0, x1);
     mlp_output = ne_mul_mat(ctx0, model.layers[il].ffn[1], mlp_output);
 
+#ifdef NE_TP_MODEL
+    if (enable_tp) {
+      mlp_output = ne_all_reduce(ctx0, mlp_output);
+    }
+#endif
+
     inpL = ne_add(ctx0, hidden_states, mlp_output);
   }
 
@@ -306,7 +331,7 @@ static bool chatglm_model_eval_internal(model_context* ctx, const model_input* i
 
   lctx.use_buf(ctx0, -1);
   if (embd->ne[0] > 1) {
-    inpL = ne_view_1d(ctx0, inpL, hidden_size, (embd->ne[0] - 1) * hidden_size * ne_element_size(inpL));
+    inpL = ne_view_1d(ctx0, inpL, n_embd, (embd->ne[0] - 1) * n_embd * ne_element_size(inpL));
   }
   // lm_head
   inpL = ne_mul_mat(ctx0, model.others[2], inpL);
diff --git a/neural_speed/models/gptj/gptj.cpp b/neural_speed/models/gptj/gptj.cpp
index 34a33ad3f..98687fef9 100644
--- a/neural_speed/models/gptj/gptj.cpp
+++ b/neural_speed/models/gptj/gptj.cpp
@@ -146,13 +146,6 @@ static bool gptj_model_eval_internal(model_context* ctx, const model_input* inpu
     memcpy(static_cast<model_token*>(embd->data) + i * N, (inputs + i)->tokens, N * ne_element_size(embd));
   }
 
-#ifdef NE_TP_MODEL
-  if (enable_tp) {
-    // need to broadcast the ids
-    broadcast(p_ctx, reinterpret_cast<float*>(embd->data), N * batch_size * ne_element_size(embd));
-  }
-#endif
-
   struct ne_tensor* inpL = ne_get_rows(ctx0, model.others[0], embd);
 
   for (int il = 0; il < n_layer; ++il) {
diff --git a/neural_speed/models/model_utils/model_files.h b/neural_speed/models/model_utils/model_files.h
index 69fd73ffd..0cb22d9fc 100644
--- a/neural_speed/models/model_utils/model_files.h
+++ b/neural_speed/models/model_utils/model_files.h
@@ -77,7 +77,16 @@ struct model_load_tensor_shard {
   void calc_size() { size = model_calc_tensor_size(ne, type); }
 };
 
-enum model_split_type { SPLIT_NONE, SPLIT_BY_COLUMNS, SPLIT_BY_ROWS, TP_1D_ROW, TP_1D_COLUMN, TP_1D_ONLY_MASTER };
+enum model_split_type {
+  SPLIT_NONE,
+  SPLIT_BY_COLUMNS,
+  SPLIT_BY_ROWS,
+  TP_1D_ROW,
+  TP_1D_COLUMN,
+  TP_1D_ONLY_MASTER,
+  TP_1D_QKV_ROW,    // chosen for fused QKV weights (W_pack / query_key_value); shaped like TP_1D_ROW
+  TP_1D_QKV_COLUMN  // chosen for the fused query_key_value bias; shaped like TP_1D_COLUMN
+};
 
 struct model_load_tensor {
   std::vector<model_load_tensor_shard> shards;
@@ -138,6 +147,11 @@ struct model_load_tensor {
           name.find(".attn.k_proj.weight") != std::string::npos ||
           name.find(".attn.v_proj.weight") != std::string::npos ||
           name.find(".mlp.fc_in.weight") != std::string::npos ||
+          // for baichuan
+          name.find(".mlp.gate_proj.weight") != std::string::npos ||
+          name.find(".mlp.up_proj.weight") != std::string::npos ||
+          // for chatglm2
+          name.find(".mlp.dense_h_to_4h.weight") != std::string::npos ||
           // for llama model
           name.find(".attention.wq.weight") != std::string::npos ||
           name.find(".attention.wk.weight") != std::string::npos ||
@@ -146,8 +160,22 @@ struct model_load_tensor {
           name.find(".feed_forward.w3.weight") != std::string::npos) {
         split_type = TP_1D_ROW;
       }
+      if (name.find(".self_attn.W_pack.weight") != std::string::npos ||
+          // for chatglm2
+          name.find(".self_attention.query_key_value.weight") != std::string::npos) {
+        split_type = TP_1D_QKV_ROW;
+      }
+      if (name.find(".self_attention.query_key_value.bias") != std::string::npos) {
+        split_type = TP_1D_QKV_COLUMN;
+      }
       if (name.find(".mlp.fc_in.bias") != std::string::npos || name.find(".mlp.fc_out.weight") != std::string::npos ||
           name.find(".attn.out_proj.weight") != std::string::npos ||
+          name.find(".self_attention.dense.weight") != std::string::npos ||
+          // for baichuan
+          name.find(".self_attn.o_proj.weight") != std::string::npos ||
+          name.find(".mlp.down_proj.weight") != std::string::npos ||
+          // for chatglm2
+          name.find(".mlp.dense_4h_to_h.weight") != std::string::npos ||
           // TODO check if this part should be column
           name.find(".attention.wo.weight") != std::string::npos ||
           name.find(".feed_forward.w2.weight") != std::string::npos) {
@@ -183,11 +211,13 @@ struct model_load_tensor {
         break;
 #ifdef NE_TP_MODEL
       case TP_1D_ROW:
+      case TP_1D_QKV_ROW:
         MODEL_ASSERT(first_shard.ne.size() > 1);
         MODEL_ASSERT(first_shard.ne[1] % world_size == 0);
         ne = {first_shard.ne[0], first_shard.ne[1] / world_size};
         break;
       case TP_1D_COLUMN:
+      case TP_1D_QKV_COLUMN:
         MODEL_ASSERT(first_shard.ne[0] % world_size == 0);
         if (first_shard.ne.size() == 1) {
           ne = {first_shard.ne[0] / world_size};
@@ -542,11 +572,17 @@ struct model_model_loader {
     }
     model_load_tensor& lt = tensors_map.tensors.at(it->second);
 #ifdef NE_TP_MODEL
-    if (lt.enable_tp && (lt.split_type == TP_1D_ROW || lt.split_type == TP_1D_COLUMN)) {
+    if (lt.enable_tp && (lt.split_type == TP_1D_ROW || lt.split_type == TP_1D_COLUMN ||
+                         lt.split_type == TP_1D_QKV_ROW || lt.split_type == TP_1D_QKV_COLUMN)) {
       // check the split dim
-      size_t split_dim_size =
-          lt.ne.size() == 1 ? lt.ne.at(0) : (lt.split_type == TP_1D_ROW ? lt.ne.at(1) : lt.ne.at(0));
-      size_t origin_dim_size = ne.size() == 1 ? ne.at(0) : (lt.split_type == TP_1D_ROW ? ne.at(1) : ne.at(0));
+      size_t split_dim_size, origin_dim_size;
+      if (lt.split_type == TP_1D_ROW || lt.split_type == TP_1D_QKV_ROW) {
+        split_dim_size = lt.ne.size() == 1 ? lt.ne.at(0) : lt.ne.at(1);
+        origin_dim_size = ne.size() == 1 ? ne.at(0) : ne.at(1);
+      } else {
+        split_dim_size = lt.ne.at(0);
+        origin_dim_size = ne.at(0);
+      }
       MODEL_ASSERT(split_dim_size == origin_dim_size / lt.world_size);
       return get_tensor_for(lt, backend);
     }
@@ -627,16 +663,38 @@ struct model_model_loader {
     }
   }
 
-  void bestla_split_weight(void** src, void** dst, size_t src_n, size_t src_k, size_t dst_n, size_t dst_k,
-                           size_t n_rank, size_t k_rank) {
+  // Unpack the packed weight *src (K = src_k, N = src_n) to fp32, take this rank's slice and repack it
+  // into *dst (K = dst_k, N = dst_n) through bestla_packweight_copyattr.
+  // n_rank / k_rank pick the slice along N / K. With qkv_fusion, N is treated as three equally sized
+  // sections and the n_rank-th part of every section is gathered into a temporary buffer; k_rank is
+  // not used on that path.
+  void bestla_split_weight(void** src, void** dst, size_t src_n, size_t src_k, size_t dst_n, size_t dst_k, size_t n_rank,
+                          size_t k_rank, bool qkv_fusion = false) {
     auto src_fp32 = (float*)malloc(src_n * src_k * sizeof(float));
     if (src_fp32 == nullptr) {
       assert(0);
     }
     bestla_unpackweight_fp32(*src, src_n, src_k, src_fp32, src_n);
     // layout will be K * N in the buffer
-    auto dst_fp32 = src_fp32 + k_rank * dst_k * src_n + n_rank * dst_n;
-    bestla_packweight_copyattr(dst_fp32, *dst, dst_n, dst_k, src_n, *src);
+    float* dst_fp32;
+    if (qkv_fusion) {
+      dst_fp32 = (float*)malloc(dst_n * dst_k * sizeof(float));
+      // check the allocation before the copy loop writes through it
+      MODEL_ASSERT(dst_fp32 != nullptr);
+      // dst_fp32 holds dst_k rows, so the copy is bounded by dst_k
+      for (size_t i = 0; i < dst_k; ++i) {
+        for (size_t j = 0; j < 3; ++j) {
+          float* dst_off = dst_fp32 + dst_n * i + j * dst_n / 3;
+          float* src_off = src_fp32 + src_n * i + j * src_n / 3 + n_rank * dst_n / 3;
+          memcpy(dst_off, src_off, dst_n * sizeof(float) / 3);
+        }
+      }
+      bestla_packweight_copyattr(dst_fp32, *dst, dst_n, dst_k, dst_n, *src);
+      free(dst_fp32);
+    } else {
+      dst_fp32 = src_fp32 + k_rank * dst_k * src_n + n_rank * dst_n;
+      bestla_packweight_copyattr(dst_fp32, *dst, dst_n, dst_k, src_n, *src);
+    }
     free(src_fp32);
   }
   void load_data_for(model_load_tensor& lt) {
@@ -679,7 +729,7 @@ struct model_model_loader {
       MODEL_ASSERT(out_offset == lt.size);
     }
 #ifdef NE_TP_MODEL
-    else if (lt.split_type == TP_1D_ROW) {
+    else if (lt.split_type == TP_1D_ROW || lt.split_type == TP_1D_QKV_ROW) {
       model_load_tensor_shard& shard = lt.shards.at(0);
       model_buffer tmp_buf;
       model_file& file = file_loaders.at(shard.file_idx)->file;
@@ -691,14 +741,22 @@ struct model_model_loader {
         void* dst_data = (void*)lt.data;
         void* src_data = (void*)(tmp_buf.addr);
         bestla_split_weight(&src_data, &dst_data, lt.world_size * num_rows, lt.ne.at(0), num_rows, lt.ne.at(0), lt.rank,
-                            0);
+                           0, lt.split_type == TP_1D_QKV_ROW);
       } else {
         // only copy part of weight form the tmp_buf of origin file
         tmp_buf.resize(lt.size * lt.world_size);
         file.read_raw(tmp_buf.addr, lt.size * lt.world_size);
-        memcpy(lt.data, tmp_buf.addr + lt.rank * lt.size, lt.size);
+        if (lt.split_type == TP_1D_QKV_ROW) {
+          for (int j = 0; j < 3; ++j) {
+            auto dst_off = lt.data + j * lt.size / 3;
+            auto src_off = tmp_buf.addr + (lt.rank + j * lt.world_size) * lt.size / 3;
+            memcpy(dst_off, src_off, lt.size / 3);
+          }
+        } else {
+          memcpy(lt.data, tmp_buf.addr + lt.rank * lt.size, lt.size);
+        }
       }
-    } else if (lt.split_type == TP_1D_COLUMN) {
+    } else if (lt.split_type == TP_1D_COLUMN || lt.split_type == TP_1D_QKV_COLUMN) {
       if (lt.size == 0) {
         return;
       }
@@ -712,18 +770,33 @@ struct model_model_loader {
         file.read_raw(tmp_buf.addr, shard.size);
         void* dst_data = (void*)lt.data;
         void* src_data = (void*)(tmp_buf.addr);
+        // TODO support QKV COLUMN in bestla
         bestla_split_weight(&src_data, &dst_data, num_rows, lt.world_size * lt.ne.at(0), num_rows, lt.ne.at(0), 0,
-                            lt.rank);
+                           lt.rank);
       } else {
         tmp_buf.resize(lt.size * lt.world_size);
         file.read_raw(tmp_buf.addr, lt.size * lt.world_size);
         size_t offset = 0;
         // different data type may have differnet per_row_size
         size_t per_row_size = lt.size / num_rows;
-        for (size_t i = 0; i < num_rows; ++i) {
-          memcpy(lt.data + offset, tmp_buf.addr + lt.rank * per_row_size + i * lt.world_size * per_row_size,
-                 per_row_size);
-          offset += per_row_size;
+        if (lt.split_type == TP_1D_QKV_COLUMN) {
+          for (size_t i = 0; i < num_rows; ++i) {
+            for (int j = 0; j < 3; ++j) {
+              auto dst_off = lt.data + i * per_row_size + j * per_row_size / 3;
+              // source row i starts at i * world_size * per_row_size; inside it this rank's part of section j
+              // is part number (rank + j * world_size), each part being per_row_size / 3 bytes.
+              auto src_off = tmp_buf.addr + i * lt.world_size * per_row_size +
+                             (lt.rank + j * lt.world_size) * per_row_size / 3;
+              memcpy(dst_off, src_off, per_row_size / 3);
+            }
+            offset += per_row_size;
+          }
+        } else {
+          for (size_t i = 0; i < num_rows; ++i) {
+            memcpy(lt.data + offset, tmp_buf.addr + lt.rank * per_row_size + i * lt.world_size * per_row_size,
+                   per_row_size);
+            offset += per_row_size;
+          }
         }
         MODEL_ASSERT(offset == lt.size);
       }
diff --git a/neural_speed/models/model_utils/model_utils.cpp b/neural_speed/models/model_utils/model_utils.cpp
index af9480152..d510fad0c 100644
--- a/neural_speed/models/model_utils/model_utils.cpp
+++ b/neural_speed/models/model_utils/model_utils.cpp
@@ -68,21 +68,20 @@ static bool kv_cache_init(const struct model_hparams& hparams, struct model_kv_c
                           const ne_type wtype, const int n_ctx, const int batch_size, const int beam_size,
                           const bool shift_roped_k, model_struct* model) {
   const auto n_layer = hparams.n_layer;
-  const auto heads_kv = hparams.n_head_kv > 0 ? hparams.n_head_kv : hparams.n_head;
+  auto heads_kv = hparams.n_head_kv > 0 ? hparams.n_head_kv : hparams.n_head;
   const auto head_size = hparams.n_embd / hparams.n_head;
+#ifdef NE_TP_MODEL
+  // when use TP, cached kv will also have smaller size
+  parallel_context* p_ctx = init_parallel_context();
+  int32_t world_size = get_tp_size(p_ctx);
+  heads_kv /= world_size;
+#endif
   int32_t k_size, v_size;
   get_batch_kv_elements_from_gpt_params(heads_kv, head_size, n_ctx, wtype, &k_size, &v_size);
 
   int64_t layer_ne_k = batch_size * beam_size * k_size;
   int64_t layer_ne_v = batch_size * beam_size * v_size;
   const auto wsize = wtype == NE_TYPE_BTLA ? 1 : ne_type_size(wtype);
-#ifdef NE_TP_MODEL
-  // when use TP, cached kv will also have smaller size
-  parallel_context* p_ctx = init_parallel_context();
-  int32_t world_size = get_tp_size(p_ctx);
-  layer_ne_k /= world_size;
-  layer_ne_v /= world_size;
-#endif
 
   cache.buf.resize(n_layer * (layer_ne_k + layer_ne_v) * wsize + 2u * MB);
   cache.seq_cells.resize(batch_size * beam_size);
@@ -2070,8 +2069,14 @@ static void bestla_model_kv_cache_seq_cpy(struct model_context* ctx, const model
                                           const model_seq_id& seq_id_dst, const model_pos& p0, const model_pos& p1) {
   const auto& kv_self = ctx->model.kv_self;
   const auto& hparams = ctx->model.hparams;
-  const int heads_kv = hparams.multi_query_group_num > 0 ? hparams.multi_query_group_num : hparams.n_head;
+  int heads_kv = hparams.multi_query_group_num > 0 ? hparams.multi_query_group_num : hparams.n_head;
   const int head_size = hparams.n_embd / hparams.n_head;
+#ifdef NE_TP_MODEL
+  // when use TP, cached kv will also have smaller size
+  parallel_context* p_ctx = init_parallel_context();
+  int32_t world_size = get_tp_size(p_ctx);
+  heads_kv /= world_size;
+#endif
   const int n_ctx = ctx->n_ctx;
   const auto kv_n_ctx_block = ctx->kv_n_ctx_block;
   NE_ASSERT(("Invalid end position!", n_ctx >= p1));