import subprocess
import matplotlib.pyplot as plt
import re

# Command template for one benchmark run; "{threads}" is substituted per run.
# The backslash-newline pairs sit inside the string literal, so the whole
# command is a single line when it reaches the shell.
cmd = "C:/DATA/TestLLama/llama/main.exe \
 --seed 147369852 \
 --threads {threads} \
 --n_predict 64 \
 --model ./models/7B/ggml-model-q4_0.bin \
 --top_k 40 \
 --top_p 0.9 \
 --temp 0.5 \
 --repeat_last_n 64 \
 --repeat_penalty 1.1 \
 -p \"Write a funny joke:\""

# Range of thread counts to benchmark (inclusive of max_threads when the
# step lands on it).
min_threads = 4
max_threads = 32
step = 2

# Number of runs averaged for each thread count.
n_runs = 5

# Compiled once so the per-run parsing does not repeat the pattern lookup.
TOKEN_TIME_PATTERN = re.compile(r"\s+(\d+\.\d+) ms per token")


def parse_token_time(output: str) -> float:
    """Extract the first "<number> ms per token" figure from *output*.

    Args:
        output: Combined stdout/stderr text of one benchmark run.

    Returns:
        The first matched per-token time, in milliseconds.

    Raises:
        ValueError: If *output* contains no "ms per token" figure. The tail
            of the output is included in the message to aid diagnosis.
    """
    match = TOKEN_TIME_PATTERN.search(output)
    if match is None:
        # Without this check a failed run surfaces as an opaque
        # AttributeError on None.group().
        raise ValueError(
            "no 'ms per token' figure found in program output; "
            f"last output was: {output[-500:]!r}"
        )
    return float(match.group(1))


def measure_token_time(threads: int) -> float:
    """Run the command once with *threads* threads and return ms per token.

    Args:
        threads: Value substituted for "{threads}" in the command template.

    Returns:
        The per-token time parsed from the program output, in milliseconds.

    Raises:
        RuntimeError: If the output contains no timing figure (for example
            because the executable or model could not be started/loaded).
    """
    result = subprocess.run(
        cmd.format(threads=threads),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # capture both streams together
        shell=True,
    )
    # The captured stream contains generated text and may not be valid UTF-8;
    # replace undecodable bytes instead of aborting the whole sweep.
    output = result.stdout.decode(errors="replace")
    try:
        return parse_token_time(output)
    except ValueError as err:
        raise RuntimeError(
            f"benchmark run with {threads} threads produced no timing "
            f"(exit code {result.returncode})"
        ) from err


def benchmark_thread_count(threads: int, runs: int) -> float:
    """Return the average ms per token over *runs* runs with *threads* threads.

    Prints one progress line per run.

    Raises:
        ValueError: If *runs* is less than 1 (no average can be computed).
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    token_times = []
    for run in range(runs):
        token_time = measure_token_time(threads)
        print(f"\t {threads} threads | run {run+1}/{runs} | current token time {round(token_time, 2)} ms")
        token_times.append(token_time)

    return sum(token_times) / len(token_times)


def plot_results(threads_list, token_time_list):
    """Plot average token time against thread count and show the figure."""
    plt.plot(threads_list, token_time_list)
    plt.xlabel("Number of threads")
    plt.ylabel("Token time (ms)")
    plt.title("Token time vs Number of threads")
    plt.show()


def main():
    """Benchmark every configured thread count, then plot the averages."""
    threads_list = []
    token_time_list = []

    for threads in range(min_threads, max_threads + 1, step):
        print(f"Running with {threads} threads...")
        token_time_list.append(benchmark_thread_count(threads, n_runs))
        threads_list.append(threads)

    plot_results(threads_list, token_time_list)


if __name__ == "__main__":
    main()