8 changes: 7 additions & 1 deletion .github/workflows/llama_cpp_plugin_build_and_test.yml
@@ -4,6 +4,7 @@ on:
   pull_request:
     paths:
       - 'modules/llama_cpp_plugin/**'
+      - '.github/workflows/llama_cpp_plugin_build_and_test.yml'

 permissions: read-all

@@ -46,6 +47,11 @@ jobs:
     needs: build_ubuntu20
     runs-on: ubuntu-20.04
     steps:
+      - name: Set up Python 3.9
+        uses: actions/setup-python@39cd149 # v5.1.1
+        with:
+          python-version: "3.9"
+
       - name: Download build artifacts
         uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
         with:
@@ -60,7 +66,7 @@

       - name: Prepare test data - convert test model files
         run: |
-          pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
+          pip install -r llama.cpp/requirements/requirements-convert_hf_to_gguf.txt
           huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
           mkdir -p ${{ github.workspace }}/test_data
           python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
2 changes: 2 additions & 0 deletions modules/custom_operations/tests/requirements.txt
@@ -2,4 +2,6 @@ torch
 onnx
 tensorboard
 pytest
+# WA CVS-150813
+numpy<2.0.0
 # open3d==0.16.0 - need to update with new release
6 changes: 3 additions & 3 deletions modules/llama_cpp_plugin/src/compiled_model.cpp
@@ -27,11 +27,11 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname,
     : ICompiledModel(nullptr, plugin),
       m_gguf_fname(gguf_fname),
       m_num_threads(num_threads) {
-    OPENVINO_DEBUG << "llama_cpp_plugin: loading llama model directly from GGUF... " << std::endl;
+    OPENVINO_DEBUG("llama_cpp_plugin: loading llama model directly from GGUF... \n");
     llama_model_params mparams = llama_model_default_params();
     mparams.n_gpu_layers = 99;
     m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams);
-    OPENVINO_DEBUG << "llama_cpp_plugin: llama model loaded successfully from GGUF..." << std::endl;
+    OPENVINO_DEBUG("llama_cpp_plugin: llama model loaded successfully from GGUF...\n");

     auto input_ids = std::make_shared<ov::opset13::Parameter>(ov::element::Type_t::i64, ov::PartialShape({-1, -1}));
     auto fake_convert = std::make_shared<ov::opset13::Convert>(input_ids->output(0), ov::element::Type_t::f32);
@@ -71,7 +71,7 @@ std::shared_ptr<const ov::Model> LlamaCppModel::get_runtime_model() const {
 }

 void LlamaCppModel::set_property(const ov::AnyMap& properties) {
-    OPENVINO_DEBUG << "llama_cpp_plugin: attempted to set_property (did nothing)";
+    OPENVINO_DEBUG("llama_cpp_plugin: attempted to set_property (did nothing)");
 }

 ov::Any LlamaCppModel::get_property(const std::string& name) const {
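Both C++ files in this change migrate OPENVINO_DEBUG from stream-style insertion to a function-style call. A minimal, self-contained sketch of the two usage patterns follows; DEBUG_LOG_STREAM and DEBUG_LOG are hypothetical stand-ins for illustration, not OpenVINO's actual macro definitions:

```cpp
#include <iostream>

// Hypothetical stand-ins: the real OPENVINO_DEBUG macro is provided by
// OpenVINO's logging utilities and may compile to a no-op in release builds.
#define DEBUG_LOG_STREAM std::cerr           // old style: used like an output stream
#define DEBUG_LOG(msg) (std::cerr << (msg))  // new style: called like a function

int main() {
    // Old usage, mirroring the lines this diff removes:
    DEBUG_LOG_STREAM << "llama_cpp_plugin: loading llama model directly from GGUF... " << std::endl;
    // New usage, mirroring the lines this diff adds:
    DEBUG_LOG("llama_cpp_plugin: loading llama model directly from GGUF... \n");
    return 0;
}
```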
8 changes: 4 additions & 4 deletions modules/llama_cpp_plugin/src/infer_request.cpp
@@ -28,7 +28,7 @@ void allocate_tensor_impl(ov::SoPtr<ov::ITensor>& tensor,
 LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr<const LlamaCppModel>& compiled_model,
                                                    size_t num_threads)
     : ov::ISyncInferRequest(compiled_model) {
-    OPENVINO_DEBUG << "llama_cpp_plugin: infer request ctor called\n";
+    OPENVINO_DEBUG("llama_cpp_plugin: infer request ctor called\n");
     llama_context_params cparams = llama_context_default_params();
     cparams.n_threads = num_threads ? num_threads : std::thread::hardware_concurrency();
     cparams.n_ctx = 0;  // this means that the actual n_ctx will be taken equal to the model's train-time value
@@ -51,7 +51,7 @@ LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr<const L
 }
 void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output<const ov::Node> port,
                                                 const std::vector<ov::SoPtr<ov::ITensor>>& tensors) {
-    OPENVINO_DEBUG << "llama_cpp_plugin: set_tensors_impl called\n";
+    OPENVINO_DEBUG("llama_cpp_plugin: set_tensors_impl called\n");
 }

 void llama_batch_add_reimpl(struct llama_batch& batch,
@@ -131,12 +131,12 @@ void LlamaCppSyncInferRequest::infer() {
     llama_batch_free(batch);
 };
 std::vector<ov::ProfilingInfo> LlamaCppSyncInferRequest::get_profiling_info() const {
-    OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n";
+    OPENVINO_DEBUG("llama_cpp_plugin: get_profiling_info() called\n");
     return std::vector<ov::ProfilingInfo>{};
 };

 std::vector<ov::SoPtr<ov::IVariableState>> LlamaCppSyncInferRequest::query_state() const {
-    OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n";
+    OPENVINO_DEBUG("llama_cpp_plugin: query_state() called\n");
     return {std::static_pointer_cast<ov::IVariableState>(std::make_shared<LlamaCppState>(m_llama_ctx))};
 }

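The infer_request.cpp hunk above shows only the first parameter of llama_batch_add_reimpl. For orientation, here is a sketch of what such a batch-append helper typically does, modeled on llama.cpp's own llama_batch_add and assuming the public llama_batch layout (token, pos, n_seq_id, seq_id, logits, n_tokens); the plugin's actual reimplementation may differ:

```cpp
#include <vector>

#include "llama.h"

// Sketch only: append one token to a llama_batch, attach it to the given
// sequence IDs, and optionally request logits for it.
static void batch_add_sketch(struct llama_batch& batch,
                             llama_token id,
                             llama_pos pos,
                             const std::vector<llama_seq_id>& seq_ids,
                             bool logits) {
    batch.token[batch.n_tokens] = id;
    batch.pos[batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = static_cast<int32_t>(seq_ids.size());
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits[batch.n_tokens] = logits;
    batch.n_tokens++;
}
```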
@@ -14,14 +14,6 @@ using namespace ov::test::behavior;

 namespace {

-//
-// OV Class Common tests with <pluginName, device_name params>
-//
-
-INSTANTIATE_TEST_SUITE_P(smoke_OVClassNetworkTestP,
-                         OVClassModelTestP,
-                         ::testing::Values(ov::test::utils::DEVICE_NVIDIA));
-
 //
 // OV Class GetMetric
 //
@@ -39,8 +39,6 @@ std::vector<std::string> disabledTestPatterns() {
         R"(.*InferRequestIOBBlobTest.*canProcessDeallocatedOutputBlobAfterGetAndSetBlob.*)",
         // 119703
         R"(.*smoke_GroupConvolutionBias(Add|AddAdd)_2D_ExplicitPaddingSymmetric2.*FP16*.*)",
-        // Issue: 128924
-        R"(.*smoke_OVClassNetworkTestP/OVClassModelTestP.ImportModelWithNullContextThrows.*)",
     };

 #ifdef _WIN32