
Commit 7e64bfe

refactor code + remove unused comments + improved README.md
1 parent 9f72b44 commit 7e64bfe

File tree

2 files changed: +102 −55 lines changed


examples/server/README.md

+27 −1 lines changed
@@ -26,6 +26,8 @@ Command line options:
 - `--embedding`: Enable embedding extraction, Default: disabled.
 - `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
 - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
+- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+-
 
 ## Build

@@ -164,7 +166,7 @@ node index.js
 
 `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
 
-`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications.
+`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 
 - **POST** `/tokenize`: Tokenize a given text.
 
@@ -196,8 +198,32 @@ node index.js
 
 It also accepts all the options of `/completion` except `stream` and `prompt`.
 
+- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+
 ## More examples
 
+### Change system prompt on runtime
+
+To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
+
+`prompt`: Specify a context that you want all connecting clients to respect.
+
+`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
+
+`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
+
+```json
+{
+    "system_prompt": {
+        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
+        "anti_prompt": "User:",
+        "assistant_name": "Assistant:"
+    }
+}
+```
+
+**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
+
 ### Interactive mode
 
 Check the sample in [chat.mjs](chat.mjs).
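As a rough illustration of the workflow the new README section describes, here is a minimal client sketch in TypeScript (in the spirit of `chat.mjs`). It assumes the server listens on `http://localhost:8080`, that `/props` exposes `assistant_name` and `anti_prompt` fields, and that `/completion` accepts a `stop` list and returns a `content` field; none of these details are spelled out in this diff, so treat them as assumptions rather than a documented API.

```typescript
// Hypothetical client sketch (not part of this commit).
// Assumptions: server at http://localhost:8080, /props returns
// { assistant_name, anti_prompt }, /completion accepts { prompt, stop }
// and returns { content }.
const base = "http://localhost:8080";

async function chat(userMessage: string): Promise<string> {
  // 1. Ask the server which names the shared system prompt was built with.
  const props = await fetch(`${base}/props`).then(r => r.json());
  const assistant: string = props.assistant_name; // e.g. "Assistant:"
  const anti: string = props.anti_prompt;         // e.g. "User:"

  // 2. Continue the shared transcript: user turn, then the assistant cue.
  const prompt = `${userMessage}\n${assistant}`;

  // 3. Stop generation when the model starts a new user turn.
  const res = await fetch(`${base}/completion`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ prompt, stop: [anti] }),
  });
  const data = await res.json();
  return data.content as string;
}

chat("Recommend a good book about physics.").then(console.log);
```

The only point of the sketch is the shape of the flow: read the shared names once from `/props`, then build each request around them and stop at the anti-prompt.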

examples/server/server.cpp

+75 −54 lines changed
@@ -3,7 +3,7 @@
 #include "build-info.h"
 #include "grammar-parser.h"
 
-#define SERVER_MULTIMODAL_SUPPORT
+// #define SERVER_MULTIMODAL_SUPPORT
 
 #ifdef SERVER_MULTIMODAL_SUPPORT
 #include "../llava/clip.h"
@@ -44,11 +44,6 @@ struct server_params
     int32_t write_timeout = 600;
 };
 
-// struct beam_search_callback_data {
-//     llama_server_context* ctx;
-//     llama_client_slot* slot;
-// };
-
 static bool server_verbose = false;
 
 #if SERVER_VERBOSE != 1
@@ -660,6 +655,7 @@ struct llama_server_context
        }
        waitAllAreIdle();
        all_slots_are_idle = true;
+
        // wait until system prompt load
        update_system_prompt = true;
        while(update_system_prompt) {
@@ -672,7 +668,11 @@ struct llama_server_context
        system_prompt = sys_props.value("prompt", "");
        user_name = sys_props.value("anti_prompt", "");
        assistant_name = sys_props.value("assistant_name", "");
-       notifySystemPromptChanged();
+       if(slots.size() > 0) {
+           notifySystemPromptChanged();
+       } else {
+           update_system_prompt = true;
+       }
     }
 
     void waitAllAreIdle() {
@@ -813,6 +813,7 @@ struct llama_server_context
        });
        return has_next_token; // continue
    }
+
 #ifdef SERVER_MULTIMODAL_SUPPORT
     bool processImages(llama_client_slot &slot) {
         for(slot_image &img : slot.images) {
@@ -1204,6 +1205,11 @@ struct llama_server_context
     }
 };
 
+struct server_beam_search_callback_data {
+    llama_context * ctx;
+    llama_client_slot * slot;
+};
+
 static void server_print_usage(const char *argv0, const gpt_params &params,
                                const server_params &sparams)
 {
@@ -1251,14 +1257,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
     printf("  -np N, --parallel N   number of slots for process requests (default: %d)\n", params.n_parallel);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  -spf FNAME, --system-prompt-file FNAME\n");
+    printf("                        Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
 #ifdef SERVER_MULTIMODAL_SUPPORT
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA.\n");
 #endif
     printf("\n");
 }
 
 static void server_params_parse(int argc, char **argv, server_params &sparams,
-                                gpt_params &params)
+                                gpt_params &params, llama_server_context& llama)
 {
     gpt_params default_params;
     server_params default_sparams;
@@ -1523,6 +1531,26 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 break;
             }
             params.n_predict = std::stoi(argv[i]);
+        } else if (arg == "-spf" || arg == "--system-prompt-file")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::string systm_content = "";
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(systm_content)
+            );
+            llama.processSystemPromptData(json::parse(systm_content));
         }
 #ifdef SERVER_MULTIMODAL_SUPPORT
         else if(arg == "--mmproj") {
@@ -1864,8 +1892,8 @@ static void log_server_request(const Request &req, const Response &res)
     });
 }
 
-static bool is_at_eob(llama_server_context * server_context, const llama_token *tokens, const size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context->ctx);
+static bool is_at_eob(const server_beam_search_callback_data & server_context, const llama_token *tokens, const size_t n_tokens) {
+    return n_tokens && tokens[n_tokens - 1] == llama_token_eos(server_context.ctx);
 }
 
 // Function matching type llama_beam_search_callback_fn_t.
@@ -1875,34 +1903,34 @@ static bool is_at_eob(llama_server_context *
 // This is also called when the stop condition is met.
 // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
 
-// AVOID HEADACHES unnecessaries
-
-// static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
-//     auto & llama = *static_cast<beam_search_callback_data*>(callback_data);
-//     // Mark beams as EOS as needed.
-//     for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-//         llama_beam_view& beam_view = beams_state.beam_views[i];
-//         if (!beam_view.eob && is_at_eob(llama.ctx, beam_view.tokens, beam_view.n_tokens)) {
-//             beam_view.eob = true;
-//         }
-//     }
-//     printf(","); // Show progress
-//     if (const size_t n = beams_state.common_prefix_length) {
-//         llama.slot->generated_token_probs.resize(llama.slot->generated_token_probs.size() + n);
-//         assert(0u < beams_state.n_beams);
-//         const llama_token * tokens = beams_state.beam_views[0].tokens;
-//         const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
-//         std::transform(tokens, tokens + n, llama.slot->generated_token_probs.end() - n, map);
-//         printf("%zu", n);
-//     }
-//     fflush(stdout);
-// #if 0 // DEBUG: print current beams for this iteration
-//     std::cout << "\n\nCurrent beams:\n";
-//     for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
-//         std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
-//     }
-// #endif
-// }
+// NO TESTED after PR #3589
+
+static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
+    auto & llama = *static_cast<server_beam_search_callback_data*>(callback_data);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(","); // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        llama.slot->generated_token_probs.resize(llama.slot->generated_token_probs.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
+        std::transform(tokens, tokens + n, llama.slot->generated_token_probs.end() - n, map);
+        printf("%zu", n);
+    }
+    fflush(stdout);
+#if 0 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams:\n";
+    for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
 
 struct token_translator {
     llama_context * ctx;
@@ -1933,7 +1961,7 @@ int main(int argc, char **argv)
     // struct that contains llama context and inference
     llama_server_context llama;
 
-    server_params_parse(argc, argv, sparams, params);
+    server_params_parse(argc, argv, sparams, params, llama);
 
     if (params.model_alias == "unknown")
     {
@@ -2015,8 +2043,6 @@ int main(int argc, char **argv)
                 llama.processSystemPromptData(data["system_prompt"]);
             }
 
-            // llama_reset_timings(llama.ctx);
-
             slot->reset();
 
             parse_options_completion(data, slot, llama);
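The hunk above is the server-side hook for the README's "Change system prompt on runtime" section: when a `/completion` request body carries a `system_prompt` object, it is handed to `processSystemPromptData` before the slot is reset. A minimal sketch of such a set-up request follows; the address and the use of an empty `prompt` for a request that only installs the system prompt are assumptions, not something this diff documents.

```typescript
// Hypothetical sketch (not part of this commit): install a new shared
// system prompt on a running server through the /completion request body.
// Assumes the server is at http://localhost:8080 and tolerates an empty
// prompt when the request is only used to carry system_prompt.
const base = "http://localhost:8080";

async function setSystemPrompt(): Promise<void> {
  await fetch(`${base}/completion`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      prompt: "",
      system_prompt: {
        prompt: "Transcript of a dialog between a User and an Assistant.\nUser:",
        anti_prompt: "User:",
        assistant_name: "Assistant:",
      },
    }),
  });
}

setSystemPrompt().catch(console.error);
```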
@@ -2030,14 +2056,14 @@ int main(int argc, char **argv)
             if (!slot->params.stream) {
                 std::string completion_text = "";
                 if (llama.params.n_beams) {
-                    // // Fill llama.generated_token_probs vector with final beam.
-                    // beam_search_callback_data data_;
-                    // data_.slot = slot;
-                    // data_.ctx = &llama;
-                    // llama_beam_search(llama.ctx, beam_search_callback, &data_, llama.params.n_beams,
-                    //     slot->n_past, llama.params.n_predict);
-                    // // Translate llama.generated_token_probs to llama.generated_text.
-                    // append_to_generated_text_from_generated_token_probs(llama, slot);
+                    // Fill llama.generated_token_probs vector with final beam.
+                    server_beam_search_callback_data data_beam;
+                    data_beam.slot = slot;
+                    data_beam.ctx = llama.ctx;
+                    llama_beam_search(llama.ctx, beam_search_callback, &data_beam, llama.params.n_beams,
+                        slot->n_past, llama.params.n_predict);
+                    // Translate llama.generated_token_probs to llama.generated_text.
+                    append_to_generated_text_from_generated_token_probs(llama, slot);
                 } else {
                     while (slot->isProcessing()) {
                         if(slot->hasNewToken()) {
@@ -2055,8 +2081,6 @@ int main(int argc, char **argv)
                 }
 
                 const json data = format_final_response(llama, slot, completion_text, probs);
-
-                //llama_print_timings(llama.ctx);
                 slot->release();
                 res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
                                 "application/json");
@@ -2138,8 +2162,6 @@ int main(int argc, char **argv)
                 llama.processSystemPromptData(data["system_prompt"]);
             }
 
-            // llama_reset_timings(llama.ctx);
-
             slot->reset();
             slot->infill = true;
 
@@ -2167,7 +2189,6 @@ int main(int argc, char **argv)
                 }
 
                 const json data = format_final_response(llama, slot, completion_text, probs);
-                //llama_print_timings(llama.ctx);
                 res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
                                 "application/json");
             } else {

0 commit comments