Spaces:

DanofficeIT
/

privatellm

Runtime error

lhhj

first

57e3690 4 months ago

7.63 kB

	#pragma once

	#include "ggml.h"
	#include "ggml-backend.h"

	#ifdef __cplusplus
	extern "C" {
	#endif

	// Scheduling priorities
	// Used as the `prio` field of struct ggml_threadpool_params.
	// The numeric values are written out explicitly; they are the same values
	// the compiler would assign implicitly (0, 1, 2, 3).
	enum ggml_sched_priority {
	    GGML_SCHED_PRIO_NORMAL   = 0,
	    GGML_SCHED_PRIO_MEDIUM   = 1,
	    GGML_SCHED_PRIO_HIGH     = 2,
	    GGML_SCHED_PRIO_REALTIME = 3
	};

	// Threadpool params
	// Populate the defaults with ggml_threadpool_params_default() or
	// ggml_threadpool_params_init() before adjusting individual fields.
	struct ggml_threadpool_params {
	    // mask of cpu cores (all-zeros means use default affinity settings)
	    bool cpumask[GGML_MAX_N_THREADS];

	    // number of threads
	    int n_threads;

	    // thread priority
	    enum ggml_sched_priority prio;

	    // polling level (0 - no polling, 100 - aggressive polling)
	    uint32_t poll;

	    // strict cpu placement
	    bool strict_cpu;

	    // start in paused state
	    bool paused;
	};

	// Threadpool type. Only this forward declaration is visible in the header,
	// so code including it can hold pointers to a threadpool but cannot access
	// its members.
	struct ggml_threadpool; // forward declaration, see ggml.c

	// Pointer alias for struct ggml_threadpool (used by ggml_backend_cpu_set_threadpool()).
	typedef struct ggml_threadpool * ggml_threadpool_t;

	// Compute plan: must be prepared (see ggml_graph_plan()) before it is
	// handed to ggml_graph_compute()
	// since https://github.com/ggerganov/ggml/issues/287
	struct ggml_cplan {
	    // size of work buffer, calculated by `ggml_graph_plan()`
	    size_t work_size;
	    // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
	    uint8_t * work_data;

	    // thread count and threadpool associated with this plan
	    int n_threads;
	    struct ggml_threadpool * threadpool;

	    // abort ggml_graph_compute when true
	    ggml_abort_callback abort_callback;
	    // pointer stored alongside abort_callback
	    // (presumably forwarded to it on each call - confirm in the implementation)
	    void * abort_callback_data;
	};

	// numa strategies
	// Argument type of ggml_numa_init(). Each strategy has an explicitly fixed
	// numeric value.
	enum ggml_numa_strategy {
	GGML_NUMA_STRATEGY_DISABLED = 0,
	GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
	GGML_NUMA_STRATEGY_ISOLATE = 2,
	GGML_NUMA_STRATEGY_NUMACTL = 3,
	GGML_NUMA_STRATEGY_MIRROR = 4,
	// not a strategy: takes the next value after the last one (currently 5),
	// i.e. the number of strategies listed above; keep it as the last entry
	GGML_NUMA_STRATEGY_COUNT
	};

	// NUMA setup and query.
	// `numa` selects one of the enum ggml_numa_strategy values.
	GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
	GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

	// Scalar helpers for tensors (int32_t and float variants).

	// Create a tensor in `ctx` from a single value.
	// NOTE(review): presumably a one-element tensor holding `value` - confirm in the implementation.
	GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
	GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

	// Set `tensor` from a single value; a tensor pointer is returned.
	// NOTE(review): presumably every element is set and `tensor` itself is returned - confirm.
	GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
	GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

	// Read / write one int32_t element addressed by a single index `i`.
	// NOTE(review): the setter takes a `const` tensor pointer although its name
	// says it writes; any change must stay in sync with the definition.
	GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
	GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

	// Read / write one int32_t element addressed by four indices (i0, i1, i2, i3).
	GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
	GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);

	// Read / write one float element addressed by a single index `i`.
	GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
	GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);

	// Read / write one float element addressed by four indices (i0, i1, i2, i3).
	GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
	GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);

	// Threadpool API.

	// Two ways to obtain default parameters for `n_threads` threads:
	// _default() returns the struct by value, _init() fills in the struct pointed to by `p`.
	GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
	GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
	// Compare two parameter sets.
	// NOTE(review): presumably returns true when p0 and p1 describe the same configuration - confirm.
	GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
	// Create a threadpool from `params` / release a threadpool.
	// NOTE(review): presumably every pool returned by _new() must be released with _free() - confirm ownership rules.
	GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
	GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
	// Query the thread count of a threadpool.
	GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
	// Pause / resume a threadpool (a pool can also start paused, see ggml_threadpool_params.paused).
	GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
	GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);

	// ggml_graph_plan() has to be called before ggml_graph_compute()
	// when plan.work_size > 0, caller must allocate memory for plan.work_data
	//
	// The plan is returned by value. The inline comments on the parameters name
	// the values GGML_DEFAULT_N_THREADS and NULL as defaults; C has no default
	// arguments, so callers pass them explicitly.
	GGML_API struct ggml_cplan ggml_graph_plan(
	const struct ggml_cgraph * cgraph,
	int n_threads, /* = GGML_DEFAULT_N_THREADS */
	struct ggml_threadpool * threadpool /* = NULL */ );
	// Compute `cgraph` using the prepared `cplan`; the result is reported as an enum ggml_status.
	GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

	// same as ggml_graph_compute() but the work data is allocated as a part of the context
	// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
	GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

	// CPU feature queries (neon, sve, matmul_int8).
	// NOTE(review): each returns an int - presumably non-zero when the feature is available; confirm.
	// TODO: move to backend interface
	GGML_API int ggml_cpu_has_neon (void);
	GGML_API int ggml_cpu_has_sve (void);
	GGML_API int ggml_cpu_has_matmul_int8(void);
	// get the sve vector length in bytes
	GGML_API int ggml_cpu_get_sve_cnt(void);

	// Internal types and functions exposed for tests and benchmarks

	// Kernel function-pointer types, stored as members of struct ggml_type_traits_cpu.
	//
	// All four are pointer-to-function typedefs: a struct member cannot have
	// function type, so a plain function typedef could not be used there.
	// The `s` parameter is a restrict-qualified pointer to float; the restrict
	// qualifier is only valid on pointer types.
	typedef void (*ggml_from_float_to_mat_t)
	    (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
	typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
	    const void * GGML_RESTRICT y, size_t by, int nrc);
	typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
	    const void * GGML_RESTRICT y, int nr, int nc);
	typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
	    const void * GGML_RESTRICT y, int nr, int nc);

	// Per-type table of CPU kernels and related settings; looked up by
	// enum ggml_type through ggml_get_type_traits_cpu().
	struct ggml_type_traits_cpu {
	    ggml_from_float_to_mat_t from_float_to_mat;

	    ggml_vec_dot_t vec_dot;
	    enum ggml_type vec_dot_type;

	    // number of rows to process simultaneously
	    int64_t nrows;
	    // number of columns to process simultaneously
	    int64_t ncols;

	    ggml_gemv_t gemv;
	    ggml_gemm_t gemm;
	};

	// Look up the CPU traits table for `type`; the result is a pointer to const,
	// so callers must not modify it.
	GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);

	// CPU initialization entry point.
	// NOTE(review): what is initialized and whether repeated calls are safe is not visible here - confirm in the implementation.
	GGML_API void ggml_cpu_init(void);

	//
	// CPU backend
	//

	// Create a CPU backend handle.
	// NOTE(review): failure behavior and how the handle is released are not visible in this header - confirm in ggml-backend.h.
	GGML_API ggml_backend_t ggml_backend_cpu_init(void);

	// Test whether `backend` is a CPU backend.
	GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
	// Per-backend settings: thread count, threadpool, and abort callback with
	// its user data pointer (the same kinds of fields struct ggml_cplan carries).
	GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
	GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
	GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

	// Registry entry for the CPU backend.
	GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

	// Only declared when the build defines GGML_USE_CPU_HBM.
	#ifdef GGML_USE_CPU_HBM
	GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
	#endif

	#ifdef __cplusplus
	}
	#endif