" Requires an already running llama.cpp server
" To install, either copy or symlink this file to ~/.vim/autoload/llama.vim
" Then start generation with :call llama#doLlamaGen(),
" or add a keybind to your vimrc such as
" nnoremap Z :call llama#doLlamaGen()<CR>
" Similarly, you could add an insert mode keybind with
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
"
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
" let g:llama_api_url = "192.168.1.10:8080"
" llama_overrides can also be set at buffer or window scope. For instance,
" autocmd FileType python let b:llama_overrides = {"temperature": 0.2}
" could be added to your .vimrc to automatically use a lower temperature when
" editing a Python script
" Additionally, an override dict can be stored at the top of a file
" !*{"stop": ["User:"]}
" could be added to the start of your chatlog.txt to set the stopping token
" These parameter dicts are merged together from lowest to highest priority:
" server default -> g:llama_overrides -> w:llama_overrides ->
" b:llama_overrides -> in-file (!*) overrides
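" For example, a window-local override could be set by hand with
" :let w:llama_overrides = {"n_predict": 128}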
"
" Sublists (like logit_bias and stop) are overridden, not merged
" Example override:
" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
if !exists("g:llama_api_url")
  let g:llama_api_url = "127.0.0.1:8080"
endif
if !exists("g:llama_overrides")
  let g:llama_overrides = {}
endif
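" Default generation parameters; matching keys are replaced by override dicts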
const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
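" Template curl invocation; the JSON payload (index 2) and URL (index 8) are
" filled in for each request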
const s:curlcommand = ['curl', '--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
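" Last line written to in each buffer, keyed by buffer number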
let s:linedict = {}

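" Streaming callback: each chunk from the server arrives as a "data: {json}"
" line; strip the prefix, decode it, and append the generated text to the
" buffer, tracking the last written line in s:linedict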
func s:callbackHandler(bufn, channel, msg)
  if len(a:msg) < 3
    return
  elseif a:msg[0] == "d"
    let l:msg = a:msg[6:-1]
  else
    let l:msg = a:msg
  endif
  let l:decoded_msg = json_decode(l:msg)
  let l:newtext = split(l:decoded_msg['content'], "\n", 1)
  if len(l:newtext) > 0
    call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. l:newtext[0])
  else
    echo "nothing genned"
  endif
  if len(l:newtext) > 1
    let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], l:newtext[1:-1])
    let s:linedict[a:bufn] = s:linedict[a:bufn] + len(l:newtext) - 1
  endif
  if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
    echo "Finished generation"
  endif
endfunction

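" Starts a streaming completion job for the current buffer, or stops the job
" that is already running. Generation parameters come from the override dicts
" described at the top of this file.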
func llama#doLlamaGen()
  if exists("b:job")
    if job_status(b:job) == "run"
      call job_stop(b:job)
      return
    endif
  endif

  let l:cbuffer = bufnr("%")
  let s:linedict[l:cbuffer] = line('$')
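  " Only the first 1000 lines of the buffer are used as the prompt;
  " generated text is written starting at the current last line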
  let l:buflines = getbufline(l:cbuffer, 1, 1000)
  let l:querydata = copy(s:querydata)
  call extend(l:querydata, g:llama_overrides)
  if exists("w:llama_overrides")
    call extend(l:querydata, w:llama_overrides)
  endif
  if exists("b:llama_overrides")
    call extend(l:querydata, b:llama_overrides)
  endif
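  " An in-file override dict (!*{...}) on the first line takes highest
  " priority and is removed from the prompt before the request is sent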
  if l:buflines[0][0:1] == '!*'
    let l:userdata = json_decode(l:buflines[0][2:-1])
    call extend(l:querydata, l:userdata)
    let l:buflines = l:buflines[1:-1]
  endif
  let l:querydata.prompt = join(l:buflines, "\n")
  let l:curlcommand = copy(s:curlcommand)
  let l:curlcommand[2] = json_encode(l:querydata)
  let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
endfunction

" Echoes the tokenization of the provided string, or of the text from the
" cursor to the end of the word
" The onus is on the user to include the preceding space
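" Example usage :call llama#tokenizeWord(" llama")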
func llama#tokenizeWord(...)
  if (a:0 > 0)
    let l:input = a:1
  else
    exe "normal \"*ye"
    let l:input = @*
  endif
  let l:querydata = {"content": l:input}
  let l:curlcommand = copy(s:curlcommand)
  let l:curlcommand[2] = json_encode(l:querydata)
  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
  let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
endfunction

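" Echoes the provided text alongside the token ids returned by /tokenize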
func s:tokenizeWordCallback(plaintext, channel, msg)
  echo '"' .. a:plaintext .. '" - ' .. string(json_decode(a:msg).tokens)
endfunction

" Echoes the token count of the buffer (its first 1000 lines) or of the
" provided string
" Example usage :echo llama#tokenCount()
func llama#tokenCount(...)
  if (a:0 > 0)
    let l:buflines = a:1
  else
    let l:buflines = getline(1, 1000)
    if l:buflines[0][0:1] == '!*'
      let l:buflines = l:buflines[1:-1]
    endif
    let l:buflines = join(l:buflines, "\n")
  endif
  let l:querydata = {"content": l:buflines}
  let l:curlcommand = copy(s:curlcommand)
  let l:curlcommand[2] = json_encode(l:querydata)
  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
  let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
endfunction

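" Echoes the number of tokens in the /tokenize response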
func s:tokenCountCallback(channel, msg)
  let l:resp = json_decode(a:msg)
  echo len(l:resp.tokens)
endfunction