Skip to content

Instantly share code, notes, and snippets.

@ChenYFan
Created April 29, 2026 06:07
Show Gist options
  • Select an option

  • Save ChenYFan/ee5b0441e857b09135c3b2269c88f3a6 to your computer and use it in GitHub Desktop.

Select an option

Save ChenYFan/ee5b0441e857b09135c3b2269c88f3a6 to your computer and use it in GitHub Desktop.
Nyirusu与llama.cpp的桥接程序
/*
* nry_provider_bridge.c — Unified C bridge for NyirusuCore LlamaCpp provider
*
* Separates model/context/mmproj/sampler lifecycle for flexible JS-side management.
* All four handles are independent and can be created/freed separately.
*
* Future HTTP API providers will not use this bridge.
* For non-backpressure providers, pipeline layer simulates by:
* stop current generation → modify context → new generation, UUID stays the same.
*/
#include "llama.h"
#include "ggml.h"
#include "mtmd.h"
#include "mtmd-helper.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/* ========== Handle types ========== */
/* Model handle: the loaded llama model plus values cached at load time. */
typedef struct {
    struct llama_model *model;       /* released by nry_model_free */
    const struct llama_vocab *vocab; /* obtained from the model via llama_model_get_vocab */
    int32_t n_embd;                  /* cached llama_model_n_embd_inp() result */
} nry_model;
/* Context handle: a llama context plus bridge-side bookkeeping. */
typedef struct {
    struct llama_context *ctx; /* released by nry_context_free */
    llama_memory_t mem;        /* llama_get_memory(ctx), cached at creation */
    nry_model *model;          /* non-owning back-reference */
    llama_pos n_past;          /* position counter maintained by the bridge's decode/eval calls */
} nry_context;
/* Sampler handle wrapping a single llama sampler chain. */
typedef struct {
    struct llama_sampler *chain; /* built by nry_sampler_create / nry_sampler_update */
} nry_sampler;
/* Multimodal projector handle wrapping an mtmd context. */
typedef struct {
    mtmd_context *mctx; /* created by mtmd_init_from_file, released by mtmd_free */
} nry_mmproj;
/* ========== Backend (call once) ========== */
/* Initialize the llama.cpp backend and log that it happened. */
void nry_backend_init(void)
{
    llama_backend_init();
    fputs("[nry] backend initialized\n", stderr);
}
/* Release the llama.cpp backend and log that it happened. */
void nry_backend_free(void)
{
    llama_backend_free();
    fputs("[nry] backend freed\n", stderr);
}
/* ========== Model ========== */
/*
 * Load a model file and wrap it in a newly allocated nry_model.
 *
 * path          model file path (must not be NULL)
 * n_gpu_layers  forwarded to llama_model_params.n_gpu_layers
 * no_mmap       non-zero disables use_mmap
 * vocab_only    non-zero sets vocab_only
 *
 * Returns the handle (release it with nry_model_free), or NULL when path is
 * NULL, the wrapper allocation fails, or llama.cpp cannot load the file.
 */
nry_model *nry_model_load(const char *path, int n_gpu_layers, int no_mmap, int vocab_only) {
    /* Guard first: a NULL path would otherwise reach fprintf("%s") and the loader. */
    if (!path) {
        fprintf(stderr, "[nry] failed to load model: path is NULL\n");
        return NULL;
    }

    nry_model *m = calloc(1, sizeof *m);
    if (!m) return NULL;

    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = n_gpu_layers;
    mparams.use_mmap = !no_mmap;
    mparams.vocab_only = vocab_only ? true : false;

    fprintf(stderr, "[nry] loading model: %s (gpu_layers=%d, vocab_only=%d)\n", path, n_gpu_layers, vocab_only);
    m->model = llama_model_load_from_file(path, mparams);
    if (!m->model) {
        fprintf(stderr, "[nry] failed to load model\n");
        free(m);
        return NULL;
    }

    /* Cache the vocab pointer and input embedding width for later calls. */
    m->vocab = llama_model_get_vocab(m->model);
    m->n_embd = llama_model_n_embd_inp(m->model);
    fprintf(stderr, "[nry] model loaded, n_embd=%d\n", m->n_embd);
    return m;
}
/* Release the llama model (if any) and the wrapper; a NULL handle is a no-op. */
void nry_model_free(nry_model *m)
{
    if (m == NULL) {
        return;
    }
    if (m->model != NULL) {
        llama_model_free(m->model);
    }
    free(m);
    fputs("[nry] model freed\n", stderr);
}
/* Return the embedding width cached at load time, or 0 for a NULL handle. */
int32_t nry_model_n_embd(nry_model *m)
{
    if (m == NULL) {
        return 0;
    }
    return m->n_embd;
}
/* Return llama_model_n_embd_out() for the wrapped model, or 0 for a NULL handle. */
int32_t nry_model_n_embd_out(nry_model *m)
{
    if (m == NULL) {
        return 0;
    }
    return llama_model_n_embd_out(m->model);
}
/* ========== Context ========== */
/*
 * Translate the bridge's KV-cache type code into a ggml type:
 * 1 -> Q8_0, 2 -> Q4_0, anything else (including 0) -> F16.
 */
static enum ggml_type kv_type_map(int kv_type)
{
    if (kv_type == 1) {
        return GGML_TYPE_Q8_0;
    }
    if (kv_type == 2) {
        return GGML_TYPE_Q4_0;
    }
    return GGML_TYPE_F16;
}
/*
 * Create a single-sequence inference context for a loaded model.
 *
 * n_ctx is passed through unchanged; non-positive n_batch / n_ubatch fall
 * back to 512. kv_type selects the K and V cache type via kv_type_map and a
 * non-zero flash_attn enables flash attention.
 *
 * Returns NULL if the model handle is missing, allocation fails, or
 * llama_init_from_model fails. Release the result with nry_context_free.
 */
nry_context *nry_context_create(nry_model *m, int n_ctx, int flash_attn,
                                int kv_type, int n_batch, int n_ubatch)
{
    if (m == NULL || m->model == NULL) {
        return NULL;
    }

    nry_context *handle = calloc(1, sizeof *handle);
    if (handle == NULL) {
        return NULL;
    }

    const enum ggml_type cache_type = kv_type_map(kv_type);

    struct llama_context_params params = llama_context_default_params();
    params.no_perf = false;
    params.n_ctx = n_ctx;
    params.n_batch = (n_batch > 0) ? n_batch : 512;
    params.n_ubatch = (n_ubatch > 0) ? n_ubatch : 512;
    params.n_seq_max = 1;
    params.type_k = cache_type;
    params.type_v = cache_type;
    if (flash_attn) {
        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
    } else {
        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
    }
    params.offload_kqv = 1;

    handle->ctx = llama_init_from_model(m->model, params);
    if (handle->ctx == NULL) {
        fprintf(stderr, "[nry] failed to create context\n");
        free(handle);
        return NULL;
    }

    handle->mem = llama_get_memory(handle->ctx);
    handle->model = m;
    handle->n_past = 0;

    fprintf(stderr, "[nry] context created, n_ctx=%d, kv_type=%d, flash_attn=%d\n",
            n_ctx, kv_type, flash_attn);
    return handle;
}
/*
 * Create a single-sequence context with embeddings enabled.
 *
 * pooling_type is cast directly to enum llama_pooling_type:
 * 0=NONE, 1=MEAN, 2=CLS, 3=LAST.
 * Non-positive n_ctx, n_batch and n_ubatch each fall back to 512.
 *
 * Returns NULL if the model handle is missing, allocation fails, or
 * llama_init_from_model fails. Release the result with nry_context_free.
 */
nry_context *nry_context_create_embedding(nry_model *m, int n_ctx, int flash_attn,
                                          int kv_type, int n_batch, int n_ubatch,
                                          int pooling_type)
{
    if (m == NULL || m->model == NULL) {
        return NULL;
    }

    nry_context *handle = calloc(1, sizeof *handle);
    if (handle == NULL) {
        return NULL;
    }

    const enum ggml_type cache_type = kv_type_map(kv_type);

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx = (n_ctx > 0) ? n_ctx : 512;
    params.n_batch = (n_batch > 0) ? n_batch : 512;
    params.n_ubatch = (n_ubatch > 0) ? n_ubatch : 512;
    params.n_seq_max = 1;
    params.type_k = cache_type;
    params.type_v = cache_type;
    if (flash_attn) {
        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
    } else {
        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
    }
    params.offload_kqv = 1;
    params.embeddings = true;
    params.pooling_type = (enum llama_pooling_type)pooling_type;

    handle->ctx = llama_init_from_model(m->model, params);
    if (handle->ctx == NULL) {
        fprintf(stderr, "[nry] failed to create embedding context\n");
        free(handle);
        return NULL;
    }

    handle->mem = llama_get_memory(handle->ctx);
    handle->model = m;
    handle->n_past = 0;

    /* Log the effective n_ctx (after the fallback), not the raw argument. */
    fprintf(stderr, "[nry] embedding context created, n_ctx=%d, pooling_type=%d\n",
            params.n_ctx, pooling_type);
    return handle;
}
/* Toggle embedding output on the context; ignored when the handle or its ctx is NULL. */
void nry_set_embeddings(nry_context *c, int enabled)
{
    if (c == NULL || c->ctx == NULL) {
        return;
    }
    llama_set_embeddings(c->ctx, enabled != 0);
}
/* Clear the context's memory (including data) and reset the position counter. */
void nry_context_clear(nry_context *c)
{
    if (c != NULL) {
        llama_memory_clear(c->mem, true);
        c->n_past = 0;
    }
}
/*
 * Release the llama context (if any) and the wrapper; NULL is a no-op.
 * The back-referenced model is not freed here.
 */
void nry_context_free(nry_context *c)
{
    if (c == NULL) {
        return;
    }
    if (c->ctx != NULL) {
        llama_free(c->ctx);
    }
    free(c);
    fputs("[nry] context freed\n", stderr);
}
/* Return the bridge-tracked position counter, or 0 for a NULL handle. */
int nry_context_n_past(nry_context *c)
{
    if (c == NULL) {
        return 0;
    }
    return (int)c->n_past;
}
/* ========== Sampler ========== */
/*
 * Build a sampler chain in this order: penalties (last_n=64, repeat=1.1),
 * temperature, top-p, optional min-p (only when min_p > 0), then the
 * seeded distribution sampler.
 *
 * Returns NULL only if the wrapper allocation fails. Release the result
 * with nry_sampler_free.
 */
nry_sampler *nry_sampler_create(float temp, float top_p, float min_p, uint32_t seed)
{
    nry_sampler *handle = calloc(1, sizeof *handle);
    if (handle == NULL) {
        return NULL;
    }

    struct llama_sampler *chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    handle->chain = chain;

    llama_sampler_chain_add(chain, llama_sampler_init_penalties(64, 1.1f, 0.0f, 0.0f));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(temp));
    llama_sampler_chain_add(chain, llama_sampler_init_top_p(top_p, 1));
    if (min_p > 0.0f) {
        llama_sampler_chain_add(chain, llama_sampler_init_min_p(min_p, 1));
    }
    llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));

    fprintf(stderr, "[nry] sampler created (temp=%.2f, top_p=%.2f, min_p=%.2f, seed=%u, repeat_penalty=1.1, last_n=64)\n",
            temp, top_p, min_p, seed);
    return handle;
}
/* Reset the sampler chain's state; ignored when the handle or chain is NULL. */
void nry_sampler_reset(nry_sampler *s)
{
    if (s == NULL || s->chain == NULL) {
        return;
    }
    llama_sampler_reset(s->chain);
}
/* Release the sampler chain (if any) and the wrapper; NULL is a no-op. */
void nry_sampler_free(nry_sampler *s)
{
    if (s == NULL) {
        return;
    }
    if (s->chain != NULL) {
        llama_sampler_free(s->chain);
    }
    free(s);
}
/*
 * Rebuild the sampler chain in place with new parameters.
 *
 * Chain order: optional penalties (skipped when penalty_last_n == 0),
 * temperature, top-p, optional min-p (only when min_p > 0), distribution.
 *
 * The seed of the previous chain is carried over to the new distribution
 * sampler, so updating parameters no longer silently replaces the seed
 * chosen at creation with 0. Seed 0 is used only when there was no
 * previous chain to query.
 */
void nry_sampler_update(nry_sampler *s, float temp, float top_p, float min_p,
                        int32_t penalty_last_n, float penalty_repeat,
                        float penalty_freq, float penalty_present) {
    if (!s) return;

    uint32_t seed = 0;
    if (s->chain) {
        /* Read the seed before the old chain is destroyed. */
        seed = llama_sampler_get_seed(s->chain);
        llama_sampler_free(s->chain);
    }

    struct llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    s->chain = llama_sampler_chain_init(sparams);

    if (penalty_last_n != 0) {
        llama_sampler_chain_add(s->chain,
            llama_sampler_init_penalties(penalty_last_n, penalty_repeat, penalty_freq, penalty_present));
    }
    llama_sampler_chain_add(s->chain, llama_sampler_init_temp(temp));
    llama_sampler_chain_add(s->chain, llama_sampler_init_top_p(top_p, 1));
    if (min_p > 0.0f) {
        llama_sampler_chain_add(s->chain, llama_sampler_init_min_p(min_p, 1));
    }
    llama_sampler_chain_add(s->chain, llama_sampler_init_dist(seed));
}
/* ========== Perf ========== */
/* Reset the context's perf counters; ignored when the handle or its ctx is NULL. */
void nry_perf_context_reset(nry_context *c)
{
    if (c == NULL || c->ctx == NULL) {
        return;
    }
    llama_perf_context_reset(c->ctx);
}
/*
 * Copy the context's perf counters into the four output parameters.
 * All outputs are set to 0 when the handle or its ctx is NULL.
 * The output pointers themselves are written unconditionally.
 */
void nry_perf_context_read(nry_context *c,
                           double *t_p_eval_ms, double *t_eval_ms,
                           int32_t *n_p_eval, int32_t *n_eval)
{
    if (c != NULL && c->ctx != NULL) {
        const struct llama_perf_context_data perf = llama_perf_context(c->ctx);
        *t_p_eval_ms = perf.t_p_eval_ms;
        *t_eval_ms = perf.t_eval_ms;
        *n_p_eval = perf.n_p_eval;
        *n_eval = perf.n_eval;
        return;
    }

    *t_p_eval_ms = 0;
    *t_eval_ms = 0;
    *n_p_eval = 0;
    *n_eval = 0;
}
/* ========== Mmproj ========== */
/*
 * Load a multimodal projector file for the given model.
 *
 * path        mmproj file path (must not be NULL)
 * m           model handle the projector is initialized against
 * use_gpu     non-zero enables GPU use in the mtmd context
 * flash_attn  non-zero enables flash attention in the mtmd context
 *
 * Returns the handle (release it with nry_mmproj_free), or NULL when an
 * argument is missing, allocation fails, or mtmd_init_from_file fails.
 */
nry_mmproj *nry_mmproj_load(const char *path, nry_model *m, int use_gpu, int flash_attn) {
    /* path is checked too: it is passed to fprintf("%s") and to the loader. */
    if (!path || !m || !m->model) return NULL;

    nry_mmproj *mm = calloc(1, sizeof *mm);
    if (!mm) return NULL;

    struct mtmd_context_params mctx_params = mtmd_context_params_default();
    mctx_params.use_gpu = use_gpu ? true : false;
    mctx_params.print_timings = true;
    mctx_params.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;

    fprintf(stderr, "[nry] loading mmproj: %s\n", path);
    mm->mctx = mtmd_init_from_file(path, m->model, mctx_params);
    if (!mm->mctx) {
        fprintf(stderr, "[nry] failed to load mmproj\n");
        free(mm);
        return NULL;
    }

    fprintf(stderr, "[nry] mmproj loaded\n");
    return mm;
}
/* Release the mtmd context (if any) and the wrapper; NULL is a no-op. */
void nry_mmproj_free(nry_mmproj *mm)
{
    if (mm == NULL) {
        return;
    }
    if (mm->mctx != NULL) {
        mtmd_free(mm->mctx);
    }
    free(mm);
    fputs("[nry] mmproj freed\n", stderr);
}
/* ========== Tokenizer (model-level) ========== */
/*
 * Tokenize `text` with the model's vocabulary into `out` (capacity
 * max_tokens), parsing special tokens. add_special is forwarded to
 * llama_tokenize.
 *
 * Returns llama_tokenize's result unchanged, or -1 when the model handle
 * or text is NULL or the text is too long to describe with an int32_t.
 */
int nry_tokenize(nry_model *m, const char *text, int32_t *out, int max_tokens, int add_special) {
    if (!m || !text) return -1;

    /* llama_tokenize takes an int32_t length; refuse to truncate silently. */
    const size_t text_len = strlen(text);
    if (text_len > (size_t)INT32_MAX) return -1;

    return llama_tokenize(m->vocab, text, (int32_t)text_len, out, max_tokens, add_special, true);
}
/*
 * Convert one token to its text piece in `buf` (capacity buf_size), with no
 * leading-space stripping and special tokens rendered.
 * Returns llama_token_to_piece's result, or -1 for a NULL model handle.
 */
int nry_detokenize(nry_model *m, int32_t token, char *buf, int buf_size)
{
    if (m == NULL) {
        return -1;
    }

    const int32_t lstrip = 0;
    return llama_token_to_piece(m->vocab, token, buf, buf_size, lstrip, true);
}
/*
 * Return 1 if `token` is an end-of-generation token for the model's vocab,
 * 0 otherwise. A NULL model handle reports 1.
 */
int nry_is_eog(nry_model *m, int32_t token)
{
    if (m == NULL) {
        return 1;
    }
    if (llama_vocab_is_eog(m->vocab, token)) {
        return 1;
    }
    return 0;
}
/* ========== Decode operations (context-level) ========== */
/*
 * Decode `n` tokens as a single batch. On success (llama_decode returns 0)
 * the position counter advances by n. Returns llama_decode's result, or -1
 * when the handle or its ctx is NULL.
 */
int nry_decode_tokens(nry_context *c, int32_t *tokens, int n)
{
    if (c == NULL || c->ctx == NULL) {
        return -1;
    }

    const int status = llama_decode(c->ctx, llama_batch_get_one(tokens, n));
    if (status != 0) {
        return status;
    }

    c->n_past += n;
    return 0;
}
/*
 * Decode a single token. On success the position counter advances by one.
 * Returns llama_decode's result, or -1 when the handle or its ctx is NULL.
 */
int nry_decode_one(nry_context *c, int32_t token)
{
    if (c == NULL || c->ctx == NULL) {
        return -1;
    }

    /* The by-value parameter serves as the one-element token buffer. */
    const int status = llama_decode(c->ctx, llama_batch_get_one(&token, 1));
    if (status != 0) {
        return status;
    }

    c->n_past++;
    return 0;
}
/*
 * Tokenize `text` with the context's model vocabulary and decode it in
 * chunks, advancing the position counter after each successful chunk.
 *
 * - The chunk size follows the context's configured n_batch rather than a
 *   fixed 512, so contexts created with a smaller n_batch are not handed
 *   oversized batches.
 * - Texts longer than the 8192-token stack buffer are retried with a heap
 *   buffer sized from llama_tokenize's negative return value.
 *
 * Returns 0 on success, -1 on bad arguments or tokenization/allocation
 * failure, otherwise the non-zero llama_decode result of the failing chunk
 * (earlier chunks stay counted in n_past).
 */
int nry_decode_text(nry_context *c, const char *text, int add_special) {
    enum { NRY_STACK_TOKENS = 8192, NRY_FALLBACK_BATCH = 512 };

    if (!c || !c->ctx || !c->model || !text) return -1;

    const size_t text_len = strlen(text);
    if (text_len > (size_t)INT32_MAX) return -1;

    int32_t stack_tokens[NRY_STACK_TOKENS];
    int32_t *heap_tokens = NULL;
    int32_t *tokens = stack_tokens;

    int32_t n = llama_tokenize(c->model->vocab, text, (int32_t)text_len,
                               tokens, NRY_STACK_TOKENS, add_special, true);
    if (n < 0) {
        /* Negative result is minus the required count; INT32_MIN signals overflow. */
        if (n == INT32_MIN) return -1;
        const int32_t needed = -n;
        heap_tokens = malloc((size_t)needed * sizeof *heap_tokens);
        if (!heap_tokens) return -1;
        n = llama_tokenize(c->model->vocab, text, (int32_t)text_len,
                           heap_tokens, needed, add_special, true);
        if (n < 0) {
            free(heap_tokens);
            return -1;
        }
        tokens = heap_tokens;
    }

    int32_t batch_size = (int32_t)llama_n_batch(c->ctx);
    if (batch_size <= 0) batch_size = NRY_FALLBACK_BATCH;

    int rc = 0;
    for (int32_t i = 0; i < n; i += batch_size) {
        const int32_t n_cur = (n - i < batch_size) ? (n - i) : batch_size;
        struct llama_batch batch = llama_batch_get_one(tokens + i, n_cur);
        rc = llama_decode(c->ctx, batch);
        if (rc != 0) break;
        c->n_past += n_cur;
    }

    free(heap_tokens);
    return rc;
}
/*
 * Read an embedding file and decode its contents into the context.
 *
 * Embedding file format:
 *   int32 n_tokens + int32 n_embd + int32 n_pos + float[n_tokens*n_embd]
 *
 * n_pos is read as part of the header but not used: batches are submitted
 * with pos == NULL and the position counter advances by the number of
 * embedding rows decoded.
 *
 * Returns:
 *    0  success (including a file with n_tokens == 0, which is a no-op)
 *   -1  bad arguments or the file cannot be opened
 *   -2  header cannot be read or holds a negative/zero dimension
 *   -3  n_embd in the file differs from the model's cached n_embd
 *   -4  payload size overflows or allocation fails
 *   -5  payload is shorter than the header promises
 *   -6  llama_decode failed (earlier chunks stay counted in n_past)
 */
int nry_decode_embd_file(nry_context *c, const char *path) {
    enum { NRY_FALLBACK_BATCH = 512 };

    if (!c || !c->ctx || !c->model || !path) return -1;

    FILE *f = fopen(path, "rb");
    if (!f) return -1;

    int rc = 0;
    float *embd_data = NULL;
    size_t n_floats = 0;
    int32_t batch_size = 0;
    int32_t n_tokens = 0;
    int32_t n_embd = 0;
    int32_t n_pos = 0;

    if (fread(&n_tokens, sizeof(int32_t), 1, f) != 1 ||
        fread(&n_embd, sizeof(int32_t), 1, f) != 1 ||
        fread(&n_pos, sizeof(int32_t), 1, f) != 1) {
        rc = -2;
        goto out;
    }
    (void)n_pos;

    if (n_embd != c->model->n_embd) {
        fprintf(stderr, "[nry] embd dimension mismatch: file=%d, model=%d\n", n_embd, c->model->n_embd);
        rc = -3;
        goto out;
    }
    if (n_tokens < 0 || n_embd <= 0) {
        fprintf(stderr, "[nry] invalid embd header: n_tokens=%d, n_embd=%d\n", n_tokens, n_embd);
        rc = -2;
        goto out;
    }
    if (n_tokens == 0) {
        goto out; /* nothing to inject */
    }

    /* Size the payload in size_t so a large header cannot overflow int math. */
    if ((size_t)n_tokens > SIZE_MAX / sizeof(float) / (size_t)n_embd) {
        rc = -4;
        goto out;
    }
    n_floats = (size_t)n_tokens * (size_t)n_embd;

    embd_data = malloc(n_floats * sizeof *embd_data);
    if (!embd_data) {
        rc = -4;
        goto out;
    }
    if (fread(embd_data, sizeof(float), n_floats, f) != n_floats) {
        rc = -5;
        goto out;
    }
    fclose(f);
    f = NULL;

    fprintf(stderr, "[nry] injecting embedding: n_tokens=%d, n_embd=%d\n", n_tokens, n_embd);

    /* Chunk by the context's configured n_batch rather than a fixed 512. */
    batch_size = (int32_t)llama_n_batch(c->ctx);
    if (batch_size <= 0) batch_size = NRY_FALLBACK_BATCH;

    for (int32_t i = 0; i < n_tokens; i += batch_size) {
        const int32_t n_cur = (n_tokens - i < batch_size) ? (n_tokens - i) : batch_size;
        struct llama_batch batch = {
            .n_tokens = n_cur,
            .token = NULL,
            .embd = embd_data + (size_t)i * (size_t)n_embd,
            .pos = NULL,
            .n_seq_id = NULL,
            .seq_id = NULL,
            .logits = NULL,
        };
        if (llama_decode(c->ctx, batch) != 0) {
            fprintf(stderr, "[nry] embd decode failed at offset %d\n", i);
            rc = -6;
            goto out;
        }
        c->n_past += n_cur;
    }

    fprintf(stderr, "[nry] embedding injected, n_past=%d\n", (int)c->n_past);

out:
    free(embd_data);
    if (f) fclose(f);
    return rc;
}
/* ========== Sampling ========== */
/*
 * Sample the next token from the context's most recent output (index -1)
 * using the sampler chain. Returns -1 when either handle is NULL.
 */
int32_t nry_sample(nry_context *c, nry_sampler *s)
{
    if (c == NULL || s == NULL) {
        return -1;
    }

    const int32_t last_output = -1;
    return llama_sampler_sample(s->chain, c->ctx, last_output);
}
/* ========== Text embedding extraction ========== */
/*
 * Compute an embedding for `text` and copy up to max_embd floats into `out`.
 *
 * The context's memory is cleared first; on success n_past is set to the
 * number of tokens decoded. The sequence-level embedding is preferred, with
 * the last token's embedding as fallback.
 *
 * `out` may be NULL only when max_embd <= 0, in which case nothing is
 * copied and the return value still reports the full width.
 *
 * Returns the model's output embedding width (which may exceed the number
 * of floats copied), or:
 *   -1  bad arguments
 *   -2  tokenization produced no tokens or did not fit the 8192-token buffer
 *   -3  the text exceeds the context's n_batch, or llama_decode failed
 *   -4  no embeddings were available after decoding
 */
int nry_text_embed(nry_model *m, nry_context *c, const char *text,
                   float *out, int max_embd) {
    enum { NRY_MAX_TOKENS = 8192 };

    if (!m || !m->model || !c || !c->ctx || !text) return -1;
    if (!out && max_embd > 0) return -1;

    const size_t text_len = strlen(text);
    if (text_len > (size_t)INT32_MAX) return -2;

    int32_t tokens[NRY_MAX_TOKENS];
    int n = llama_tokenize(m->vocab, text, (int32_t)text_len, tokens, NRY_MAX_TOKENS, true, true);
    if (n <= 0) return -2;

    /* The whole text goes in one batch, so it must fit the context's n_batch. */
    const uint32_t n_batch = llama_n_batch(c->ctx);
    if ((uint32_t)n > n_batch) {
        fprintf(stderr, "[nry] text embed: %d tokens exceed n_batch=%u\n", n, (unsigned)n_batch);
        return -3;
    }

    llama_memory_clear(c->mem, true);
    c->n_past = 0;

    int rc;
    struct llama_batch batch = llama_batch_init(n, 0, 1);
    for (int i = 0; i < n; i++) {
        batch.token[i] = tokens[i];
        batch.pos[i] = i;
        batch.n_seq_id[i] = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i] = 1;
    }
    batch.n_tokens = n;

    int ret = llama_decode(c->ctx, batch);
    if (ret != 0) {
        fprintf(stderr, "[nry] text embed decode failed: %d\n", ret);
        rc = -3;
        goto out;
    }

    const float *embd = llama_get_embeddings_seq(c->ctx, 0);
    if (!embd) {
        embd = llama_get_embeddings_ith(c->ctx, -1);
    }
    if (!embd) {
        fprintf(stderr, "[nry] text embed: no embeddings available\n");
        rc = -4;
        goto out;
    }

    const int n_embd_out = llama_model_n_embd_out(m->model);
    const int n_copy = (n_embd_out < max_embd) ? n_embd_out : max_embd;
    /* A non-positive count must never reach memcpy as a huge size_t. */
    if (n_copy > 0) {
        memcpy(out, embd, (size_t)n_copy * sizeof(float));
    }

    c->n_past = n;
    fprintf(stderr, "[nry] text embedding extracted, n_tokens=%d, n_embd_out=%d\n",
            n, n_embd_out);
    rc = n_embd_out;

out:
    llama_batch_free(batch);
    return rc;
}
/* ========== Embedding extraction (mmproj) ========== */
/*
 * Encode an image through the multimodal projector and save the resulting
 * embedding to `output_path`.
 *
 * Output layout:
 *   int32 n_tokens, int32 n_embd, int32 n_pos, float[n_tokens * n_embd]
 *
 * Returns:
 *    0  success
 *   -1  bad arguments or the image cannot be loaded
 *   -2  chunk list allocation or mtmd_tokenize failed
 *   -3  tokenization produced no image chunk
 *   -4  mtmd_encode_chunk failed or produced no usable embedding
 *   -5  the output file cannot be opened
 *   -6  writing or closing the output file failed (file may be incomplete)
 */
int nry_extract_image(nry_model *m, nry_mmproj *mm, const char *image_path, const char *output_path) {
    if (!m || !m->model || !mm || !mm->mctx || !image_path || !output_path) return -1;

    mtmd_bitmap *bitmap = mtmd_helper_bitmap_init_from_file(mm->mctx, image_path);
    if (!bitmap) {
        fprintf(stderr, "[nry] failed to load image: %s\n", image_path);
        return -1;
    }

    int rc = 0;
    FILE *f = NULL;
    const mtmd_input_chunk *img_chunk = NULL;
    mtmd_input_chunks *chunks = mtmd_input_chunks_init();
    if (!chunks) {
        rc = -2;
        goto cleanup;
    }

    /* The prompt is just the media marker, so the only chunk of interest is the image. */
    mtmd_input_text text = { mtmd_default_marker(), false, true };
    const mtmd_bitmap *bitmaps[] = { bitmap };
    if (mtmd_tokenize(mm->mctx, chunks, &text, bitmaps, 1) != 0) {
        rc = -2;
        goto cleanup;
    }

    /* find image chunk */
    size_t n_chunks = mtmd_input_chunks_size(chunks);
    for (size_t i = 0; i < n_chunks; i++) {
        const mtmd_input_chunk *ch = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(ch) == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
            img_chunk = ch;
            break;
        }
    }
    if (!img_chunk) {
        rc = -3;
        goto cleanup;
    }

    if (mtmd_encode_chunk(mm->mctx, img_chunk) != 0) {
        rc = -4;
        goto cleanup;
    }

    int32_t n_tokens = (int32_t)mtmd_input_chunk_get_n_tokens(img_chunk);
    int32_t n_embd = llama_model_n_embd_inp(m->model);
    int32_t n_pos = (int32_t)mtmd_input_chunk_get_n_pos(img_chunk);
    const float *embd = mtmd_get_output_embd(mm->mctx);
    if (!embd || n_tokens <= 0 || n_embd <= 0) {
        rc = -4;
        goto cleanup;
    }
    fprintf(stderr, "[nry] extracted: n_tokens=%d, n_embd=%d, n_pos=%d\n", n_tokens, n_embd, n_pos);

    /* Count floats in size_t so large embeddings cannot overflow int math. */
    const size_t n_floats = (size_t)n_tokens * (size_t)n_embd;

    f = fopen(output_path, "wb");
    if (!f) {
        rc = -5;
        goto cleanup;
    }

    int write_ok = fwrite(&n_tokens, sizeof(int32_t), 1, f) == 1 &&
                   fwrite(&n_embd, sizeof(int32_t), 1, f) == 1 &&
                   fwrite(&n_pos, sizeof(int32_t), 1, f) == 1 &&
                   fwrite(embd, sizeof(float), n_floats, f) == n_floats;
    /* fclose flushes buffered data, so its result is part of write success. */
    if (fclose(f) != 0) write_ok = 0;
    f = NULL;

    if (!write_ok) {
        fprintf(stderr, "[nry] failed to write embedding to %s\n", output_path);
        rc = -6;
        goto cleanup;
    }

    fprintf(stderr, "[nry] saved embedding to %s (%zu bytes)\n",
            output_path, 3 * sizeof(int32_t) + n_floats * sizeof(float));

cleanup:
    if (f) fclose(f);
    if (chunks) mtmd_input_chunks_free(chunks);
    mtmd_bitmap_free(bitmap);
    return rc;
}
/* ========== Context state save/load ========== */
/* Return the serialized state size of sequence 0, or 0 when the handle or its ctx is NULL. */
size_t nry_context_state_size(nry_context *c)
{
    if (c == NULL || c->ctx == NULL) {
        return 0;
    }
    return llama_state_seq_get_size(c->ctx, 0);
}
/*
 * Serialize the state of sequence 0 into `buf` (capacity buf_size).
 * Returns llama_state_seq_get_data's result, or 0 when the handle or its
 * ctx is NULL.
 */
size_t nry_context_state_save(nry_context *c, uint8_t *buf, size_t buf_size)
{
    if (c == NULL || c->ctx == NULL) {
        return 0;
    }
    return llama_state_seq_get_data(c->ctx, buf, buf_size, 0);
}
/*
 * Clear the context's memory, then restore sequence 0 from `buf`.
 * Returns llama_state_seq_set_data's result, or 0 when the handle or its
 * ctx is NULL. n_past is left untouched here.
 */
size_t nry_context_state_load(nry_context *c, const uint8_t *buf, size_t buf_size)
{
    if (c == NULL || c->ctx == NULL) {
        return 0;
    }

    llama_memory_clear(c->mem, true);
    /* n_past is not tracked by llama; the caller must set it */
    return llama_state_seq_set_data(c->ctx, buf, buf_size, 0);
}
/* Overwrite the bridge-tracked position counter; a NULL handle is a no-op. */
void nry_context_set_n_past(nry_context *c, int n_past)
{
    if (c == NULL) {
        return;
    }
    c->n_past = n_past;
}
/* ========== Vision eval (context + mmproj) ========== */
/*
 * Tokenize `prompt` together with the image at `image_path` and evaluate
 * the resulting chunks on the context, starting at the current n_past.
 *
 * n_past is seeded with its current value before evaluation, so an early
 * failure or an empty chunk list leaves it unchanged instead of resetting
 * it to 0. Whatever the helper reports back is stored in n_past.
 *
 * The evaluation batch size follows the context's configured n_batch.
 *
 * Returns 0 on success, -1 on bad arguments or image load failure, -2 when
 * chunk allocation or tokenization fails, -3 when evaluation fails.
 */
int nry_vision_eval(nry_context *c, nry_mmproj *mm, const char *prompt, const char *image_path) {
    enum { NRY_FALLBACK_BATCH = 512 };

    if (!c || !c->ctx || !mm || !mm->mctx || !prompt || !image_path) return -1;

    mtmd_bitmap *bitmap = mtmd_helper_bitmap_init_from_file(mm->mctx, image_path);
    if (!bitmap) {
        fprintf(stderr, "[nry] failed to load image: %s\n", image_path);
        return -1;
    }

    mtmd_input_chunks *chunks = mtmd_input_chunks_init();
    if (!chunks) {
        mtmd_bitmap_free(bitmap);
        return -2;
    }

    mtmd_input_text text = { prompt, true, true };
    const mtmd_bitmap *bitmaps[] = { bitmap };
    int32_t ret = mtmd_tokenize(mm->mctx, chunks, &text, bitmaps, 1);
    if (ret != 0) {
        mtmd_bitmap_free(bitmap);
        mtmd_input_chunks_free(chunks);
        return -2;
    }

    int32_t n_batch = (int32_t)llama_n_batch(c->ctx);
    if (n_batch <= 0) n_batch = NRY_FALLBACK_BATCH;

    /* Start from the current position so a no-op or failed eval cannot zero it. */
    llama_pos new_n_past = c->n_past;
    ret = mtmd_helper_eval_chunks(mm->mctx, c->ctx, chunks, c->n_past, 0, n_batch, true, &new_n_past);
    c->n_past = new_n_past;

    mtmd_bitmap_free(bitmap);
    mtmd_input_chunks_free(chunks);

    if (ret != 0) {
        fprintf(stderr, "[nry] vision eval failed: %d\n", ret);
        return -3;
    }
    fprintf(stderr, "[nry] vision eval done, n_past=%d\n", (int)c->n_past);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment