Commit 107ec12c authored by Cao Duc Anh

close job 22072024

parent 5b96e8f0
......@@ -16,7 +16,28 @@ Các file dữ liệu sẵn sàng cho huấn luyện ở định dạng *.csv g
- text: dữ liệu văn bản
- label: nhãn phân loại, chữ thường không dấu, phân cách từ bằng dấu "_"
## Hướng dẫn triển khai nhanh
## Hướng dẫn triển khai với registry vivas
### 1. Pull docker images
Pull images with robot account:
- vietnam_text_moderation/vn-text-moderation-data
- vietnam_text_moderation/vn-text-moderation
- vietnam_text_moderation/vn-text-moderation-train
Robot account: <br>
Name
```
robot$vietnam_text_moderation+deploytextmoderation
```
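Secret (NOTE: this credential is committed in plain text — rotate it and distribute it out-of-band instead of keeping it in the README)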
```
0ZX8kHmvm1OwGoIGeOBHXopsYh8bl2OC
```
### 2. Run
```
docker compose up
```
## Hướng dẫn triển khai từ đầu
### 1. Download pretrained model
Download pytorch-model.bin: https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin?download=true <br>
......@@ -29,7 +50,7 @@ docker build -f infer.Dockerfile -t vn-text-moderation .
```
Server training
```
docker build -f train.Dockerfile -t vn-text-moderation-train .
docker build -f train.Dockerfile -t registry.vivas.vn/vietnam_text_moderation/vn-text-moderation-train .
```
Server manage data
```
......
......@@ -89,7 +89,7 @@ services:
- "8001:8001"
nlptraining:
image: vn-text-moderation-train:latest
image: registry.vivas.vn/vietnam_text_moderation/vn-text-moderation-train:latest
restart: always
env_file:
- env_file/minio.env
......
......@@ -37,3 +37,4 @@
10.3.2.100 - - [11/Jul/2024:06:41:13 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [15/Jul/2024:02:13:07 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [15/Jul/2024:02:13:22 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [22/Jul/2024:03:13:02 +0000] "POST /text-classify HTTP/1.1" 200 2301 "-" "PostmanRuntime/7.37.3" "-"
This diff is collapsed.
......@@ -7,10 +7,10 @@ from typing import List
from bert_model import BERTClassifier
from utils import get_data_from_yaml, split_chunk
from preprocess import preprocess_text
from minio_client import download_latest_model
from constants import PHOBERTBASE_DIR, MAX_TOKEN_LENGTH, DEVICE, CLASSES, MODEL_CHECKPOINT, INFER_LENGTH, CHUNK_SIZE
from logger import logger
from preprocess import preprocess_text
tokenizer = AutoTokenizer.from_pretrained(PHOBERTBASE_DIR, local_files_only=True, use_fast=False)
......
......@@ -12,7 +12,7 @@ minio_client = Minio(
)
bucket_names = [MINIO_DATA_LABELED, MINIO_MODEL_TRAINED]
def check_bucket(minio_client, bucket_names):
def check_bucket(minio_client=minio_client, bucket_names=bucket_names):
# Check if bucket exists
for bucket_name in bucket_names:
found = minio_client.bucket_exists(bucket_name)
......
......@@ -21,12 +21,12 @@ from minio.error import S3Error
from bert_model import BERTClassifier
from utils import seed_everything
from preprocess import preprocess_row
from minio_client import minio_client, download_latest_model
from constants import PHOBERTBASE_DIR, CLASSES, DEVICE, MODEL_CHECKPOINT, EPOCH, \
MAX_TOKEN_LENGTH, BATCH_SIZE, LOAD_DATA_WORKER, MINIO_MODEL_TRAINED, \
MINIO_DATA_LABELED, TEST_RATIO, K_FOLD
from logger import logger
from preprocess import preprocess_row
# Global variables to manage the training thread and the stop flag
train_thread = None
......@@ -365,7 +365,9 @@ async def start_training(request: TrainingRequest):
except Exception as exc:
is_training = False
raise HTTPException(status_code=500, detail=str(exc))
if len(dataframes) == 0:
is_training = False
raise HTTPException(status_code=500, detail="Can not get any data file")
df = pd.concat(dataframes)
df = df.dropna()
......
......@@ -2,8 +2,6 @@ import numpy as np
import torch
import yaml
from logger import logger
def get_data_from_yaml(filename):
try:
with open(filename, 'r') as f:
......@@ -18,7 +16,7 @@ def seed_everything(seed_value):
torch.manual_seed(seed_value)
if torch.cuda.is_available():
logger.info("Torch available. Start seed_everything.")
print("Torch available. Start seed_everything.")
torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
torch.backends.cudnn.deterministic = True
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment