-
Notifications
You must be signed in to change notification settings - Fork 436
feat[accl-barex]: add barex_transport by build with USE_BAREX #1045
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
91d661f
3274770
5e9f409
cd1ed0f
e206424
947ee6c
4c3678b
3a26b6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -94,6 +94,6 @@ def get_allocator(cls, device: torch_device) -> CUDAPluggableAllocator: | |
| if device not in cls._instances: | ||
| so_path = cls._get_so_path() | ||
| cls._instances[device] = CUDAPluggableAllocator( | ||
| so_path, "u2mm_alloc_wrapper", "u2mm_free_wrapper" | ||
| so_path, "u2mm_alloc_wrapper_with_stream", "u2mm_free_wrapper_with_stream" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure whether this introduces a compatibility issue.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't worry. It works. |
||
| ) | ||
| return cls._instances[device] | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -132,7 +132,17 @@ int TransferEnginePy::initializeExt(const char *local_hostname, | |||||||||||||||||||||||||||||||
| free_list_.resize(kSlabSizeKBTabLen); | ||||||||||||||||||||||||||||||||
| #if !defined(USE_ASCEND) && !defined(USE_ASCEND_DIRECT) && \ | ||||||||||||||||||||||||||||||||
| !defined(USE_ASCEND_HETEROGENEOUS) | ||||||||||||||||||||||||||||||||
| doBuddyAllocate(kMaxClassId); | ||||||||||||||||||||||||||||||||
| bool pass_alloc = false; | ||||||||||||||||||||||||||||||||
| const char *pass_alloc_env = std::getenv("PASS_ALLOC"); | ||||||||||||||||||||||||||||||||
| if (pass_alloc_env) { | ||||||||||||||||||||||||||||||||
| int val = atoi(pass_alloc_env); | ||||||||||||||||||||||||||||||||
| if (val != 0) { | ||||||||||||||||||||||||||||||||
| pass_alloc = true; | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
Comment on lines
137
to
146
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The use of
Suggested change
|
||||||||||||||||||||||||||||||||
| if (!pass_alloc) { | ||||||||||||||||||||||||||||||||
| doBuddyAllocate(kMaxClassId); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
| #endif | ||||||||||||||||||||||||||||||||
| return 0; | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
@@ -266,6 +276,7 @@ int TransferEnginePy::transferSync(const char *target_hostname, | |||||||||||||||||||||||||||||||
| if (handle_map_.count(target_hostname)) { | ||||||||||||||||||||||||||||||||
| handle = handle_map_[target_hostname]; | ||||||||||||||||||||||||||||||||
| } else { | ||||||||||||||||||||||||||||||||
| LOG(INFO) << "transferSync, cache not found, openSegment with target " << target_hostname; | ||||||||||||||||||||||||||||||||
| handle = engine_->openSegment(target_hostname); | ||||||||||||||||||||||||||||||||
| if (handle == (Transport::SegmentHandle)-1) return -1; | ||||||||||||||||||||||||||||||||
| handle_map_[target_hostname] = handle; | ||||||||||||||||||||||||||||||||
|
|
@@ -300,7 +311,17 @@ int TransferEnginePy::transferSync(const char *target_hostname, | |||||||||||||||||||||||||||||||
| batch_id, {entry}, | ||||||||||||||||||||||||||||||||
| TransferMetadata::NotifyDesc{notify->name, notify->msg}) | ||||||||||||||||||||||||||||||||
| : engine_->submitTransfer(batch_id, {entry}); | ||||||||||||||||||||||||||||||||
| if (!s.ok()) return -1; | ||||||||||||||||||||||||||||||||
| if (!s.ok()) { | ||||||||||||||||||||||||||||||||
| Status segment_status = engine_->CheckSegmentStatus(handle); | ||||||||||||||||||||||||||||||||
| if (!segment_status.ok()) { | ||||||||||||||||||||||||||||||||
| LOG(WARNING) << "submitTransfer failed with target " << target_hostname << ", CheckSegmentStatus not ok, ready to closeSegment"; | ||||||||||||||||||||||||||||||||
| std::lock_guard<std::mutex> guard(mutex_); | ||||||||||||||||||||||||||||||||
| engine_->closeSegment(handle); | ||||||||||||||||||||||||||||||||
| engine_->getMetadata()->removeSegmentDesc(target_hostname); | ||||||||||||||||||||||||||||||||
| handle_map_.erase(target_hostname); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
Comment on lines
321
to
330
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||||||||||||||||||||||||||
| return -1; | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| TransferStatus status; | ||||||||||||||||||||||||||||||||
| bool completed = false; | ||||||||||||||||||||||||||||||||
|
|
@@ -387,6 +408,14 @@ int TransferEnginePy::batchTransferSync( | |||||||||||||||||||||||||||||||
| : engine_->submitTransfer(batch_id, entries); | ||||||||||||||||||||||||||||||||
| if (!s.ok()) { | ||||||||||||||||||||||||||||||||
| engine_->freeBatchID(batch_id); | ||||||||||||||||||||||||||||||||
| Status segment_status = engine_->CheckSegmentStatus(handle); | ||||||||||||||||||||||||||||||||
| if (!segment_status.ok()) { | ||||||||||||||||||||||||||||||||
| LOG(WARNING) << "submitTransfer failed with target " << target_hostname << ", CheckSegmentStatus not ok, ready to closeSegment"; | ||||||||||||||||||||||||||||||||
| std::lock_guard<std::mutex> guard(mutex_); | ||||||||||||||||||||||||||||||||
| engine_->closeSegment(handle); | ||||||||||||||||||||||||||||||||
| engine_->getMetadata()->removeSegmentDesc(target_hostname); | ||||||||||||||||||||||||||||||||
| handle_map_.erase(target_hostname); | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This code block looks the same as the one above.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For each non-OK request, it should check the results. I guess we should wrap this code block with USE_BAREX. |
||||||||||||||||||||||||||||||||
| return -1; | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,194 @@ | ||||||
| // Copyright 2024 KVCache.AI | ||||||
| // | ||||||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
| // you may not use this file except in compliance with the License. | ||||||
| // You may obtain a copy of the License at | ||||||
| // | ||||||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||||||
| // | ||||||
| // Unless required by applicable law or agreed to in writing, software | ||||||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
| // See the License for the specific language governing permissions and | ||||||
| // limitations under the License. | ||||||
|
|
||||||
| #ifndef BAREX_CONTEXT_H_ | ||||||
| #define BAREX_CONTEXT_H_ | ||||||
|
|
||||||
| #include <infiniband/verbs.h> | ||||||
|
|
||||||
| #include <atomic> | ||||||
| #include <cstddef> | ||||||
| #include <map> | ||||||
| #include <memory> | ||||||
| #include <mutex> | ||||||
| #include <string> | ||||||
| #include <unordered_map> | ||||||
| #include <unordered_set> | ||||||
| #include <vector> | ||||||
|
|
||||||
| #include "common.h" | ||||||
| #include "transport/transport.h" | ||||||
|
|
||||||
| #ifdef USE_BAREX | ||||||
| #include <accl/barex/barex.h> | ||||||
| #include <accl/barex/xcontext.h> | ||||||
| #include <accl/barex/xlistener.h> | ||||||
| #include <accl/barex/xconnector.h> | ||||||
| #include <accl/barex/xsimple_mempool.h> | ||||||
| #include <accl/barex/xthreadpool.h> | ||||||
| #include <accl/barex/xtimer.h> | ||||||
| #include <accl/barex/xconfig_util.h> | ||||||
| #endif | ||||||
|
|
||||||
| namespace mooncake { | ||||||
|
|
||||||
| #ifdef USE_BAREX | ||||||
|
|
||||||
| using namespace accl::barex; | ||||||
| using XChannel = accl::barex::XChannel; | ||||||
| using SegmentID = Transport::SegmentID; | ||||||
| using XContext = accl::barex::XContext; | ||||||
| using BarexResult = accl::barex::BarexResult; | ||||||
|
|
||||||
| class ChannelCache { | ||||||
| public: | ||||||
| // 添加一个 channel 到指定 key & nic_id | ||||||
|
||||||
| // 添加一个 channel 到指定 key & nic_id | |
| // Add a channel to the specified key & nic_id |
Outdated
Copilot
AI
Nov 11, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment is in Chinese. Should be translated to English: '// Get the channel for the specified nic_id and idx under sid'.
| // 获取 sid 下指定 nic_id 和 idx 的 channel | |
| // Get the channel for the specified nic_id and idx under sid |
Outdated
Copilot
AI
Nov 11, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment is in Chinese. Should be translated to English: '// Delete a channel (by id and idx)'.
| // 删除某个 channel(通过id和idx) | |
| // Delete a channel (by id and idx) |
Outdated
Copilot
AI
Nov 11, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment is in Chinese. Should be translated to English: '// Check the channel status under a SegmentID'.
| // 查询某个 SegmentID 下的 channel 状态 | |
| // Check the channel status under a SegmentID |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CheckAllChannels appears to be a read-only operation, but it uses a RWSpinlock::WriteGuard. This is inefficient and semantically incorrect. It should use a RWSpinlock::ReadGuard to allow for concurrent reads. The same issue exists in copyAll at line 148.
| RWSpinlock::WriteGuard guard(lock_); | |
| RWSpinlock::ReadGuard guard(lock_); |
Outdated
Copilot
AI
Nov 11, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment is in Chinese. Should be translated to English: '// Check and remove invalid channels under a SegmentID, return the number of removed channels'.
| // 检查并删除某个 SegmentID 下的异常channel,并返回删除的数量 | |
| // Check and remove invalid channels under a SegmentID, return the number of removed channels |
Outdated
Copilot
AI
Nov 11, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment is in Chinese. Should be translated to English: '// Return all channels as a vector'.
| // 将所有的 channel 以 vector 形式返回 | |
| // Return all channels as a vector |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Uh oh!
There was an error while loading. Please reload this page.