Commit 107ec12c authored by Cao Duc Anh

close job 22072024

parent 5b96e8f0
...@@ -16,7 +16,28 @@ Các file dữ liệu sẵn sàng cho huấn luyện ở định dạng *.csv g ...@@ -16,7 +16,28 @@ Các file dữ liệu sẵn sàng cho huấn luyện ở định dạng *.csv g
- text: dữ liệu văn bản - text: dữ liệu văn bản
- label: nhãn phân loại, chữ thường không dấu, phân cách từ bằng dấu "_" - label: nhãn phân loại, chữ thường không dấu, phân cách từ bằng dấu "_"
## Hướng dẫn triển khai nhanh ## Hướng dẫn triển khai với registry vivas
### 1. Pull docker images
Pull images with robot account:
- vietnam_text_moderation/vn-text-moderation-data
- vietnam_text_moderation/vn-text-moderation
- vietnam_text_moderation/vn-text-moderation-train
Robot account: <br>
Name
```
robot$vietnam_text_moderation+deploytextmoderation
```
Secret
```
0ZX8kHmvm1OwGoIGeOBHXopsYh8bl2OC
```
### 2. Run
```
docker compose up
```
## Hướng dẫn triển khai từ đầu
### 1. Download model pretrain ### 1. Download model pretrain
Download pytorch-model.bin: https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin?download=true <br> Download pytorch-model.bin: https://huggingface.co/vinai/phobert-base/resolve/main/pytorch_model.bin?download=true <br>
...@@ -29,7 +50,7 @@ docker build -f infer.Dockerfile -t vn-text-moderation . ...@@ -29,7 +50,7 @@ docker build -f infer.Dockerfile -t vn-text-moderation .
``` ```
Server training Server training
``` ```
docker build -f train.Dockerfile -t vn-text-moderation-train . docker build -f train.Dockerfile -t registry.vivas.vn/vietnam_text_moderation/vn-text-moderation-train .
``` ```
Server manage data Server manage data
``` ```
......
...@@ -89,7 +89,7 @@ services: ...@@ -89,7 +89,7 @@ services:
- "8001:8001" - "8001:8001"
nlptraining: nlptraining:
image: vn-text-moderation-train:latest image: registry.vivas.vn/vietnam_text_moderation/vn-text-moderation-train:latest
restart: always restart: always
env_file: env_file:
- env_file/minio.env - env_file/minio.env
......
...@@ -37,3 +37,4 @@ ...@@ -37,3 +37,4 @@
10.3.2.100 - - [11/Jul/2024:06:41:13 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-" 10.3.2.100 - - [11/Jul/2024:06:41:13 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [15/Jul/2024:02:13:07 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-" 10.3.2.100 - - [15/Jul/2024:02:13:07 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [15/Jul/2024:02:13:22 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-" 10.3.2.100 - - [15/Jul/2024:02:13:22 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [22/Jul/2024:03:13:02 +0000] "POST /text-classify HTTP/1.1" 200 2301 "-" "PostmanRuntime/7.37.3" "-"
This diff is collapsed.
...@@ -7,10 +7,10 @@ from typing import List ...@@ -7,10 +7,10 @@ from typing import List
from bert_model import BERTClassifier from bert_model import BERTClassifier
from utils import get_data_from_yaml, split_chunk from utils import get_data_from_yaml, split_chunk
from preprocess import preprocess_text
from minio_client import download_latest_model from minio_client import download_latest_model
from constants import PHOBERTBASE_DIR, MAX_TOKEN_LENGTH, DEVICE, CLASSES, MODEL_CHECKPOINT, INFER_LENGTH, CHUNK_SIZE from constants import PHOBERTBASE_DIR, MAX_TOKEN_LENGTH, DEVICE, CLASSES, MODEL_CHECKPOINT, INFER_LENGTH, CHUNK_SIZE
from logger import logger from logger import logger
from preprocess import preprocess_text
tokenizer = AutoTokenizer.from_pretrained(PHOBERTBASE_DIR, local_files_only=True, use_fast=False) tokenizer = AutoTokenizer.from_pretrained(PHOBERTBASE_DIR, local_files_only=True, use_fast=False)
......
...@@ -12,7 +12,7 @@ minio_client = Minio( ...@@ -12,7 +12,7 @@ minio_client = Minio(
) )
bucket_names = [MINIO_DATA_LABELED, MINIO_MODEL_TRAINED] bucket_names = [MINIO_DATA_LABELED, MINIO_MODEL_TRAINED]
def check_bucket(minio_client, bucket_names): def check_bucket(minio_client=minio_client, bucket_names=bucket_names):
# Check if bucket exists # Check if bucket exists
for bucket_name in bucket_names: for bucket_name in bucket_names:
found = minio_client.bucket_exists(bucket_name) found = minio_client.bucket_exists(bucket_name)
......
...@@ -21,12 +21,12 @@ from minio.error import S3Error ...@@ -21,12 +21,12 @@ from minio.error import S3Error
from bert_model import BERTClassifier from bert_model import BERTClassifier
from utils import seed_everything from utils import seed_everything
from preprocess import preprocess_row
from minio_client import minio_client, download_latest_model from minio_client import minio_client, download_latest_model
from constants import PHOBERTBASE_DIR, CLASSES, DEVICE, MODEL_CHECKPOINT, EPOCH, \ from constants import PHOBERTBASE_DIR, CLASSES, DEVICE, MODEL_CHECKPOINT, EPOCH, \
MAX_TOKEN_LENGTH, BATCH_SIZE, LOAD_DATA_WORKER, MINIO_MODEL_TRAINED, \ MAX_TOKEN_LENGTH, BATCH_SIZE, LOAD_DATA_WORKER, MINIO_MODEL_TRAINED, \
MINIO_DATA_LABELED, TEST_RATIO, K_FOLD MINIO_DATA_LABELED, TEST_RATIO, K_FOLD
from logger import logger from logger import logger
from preprocess import preprocess_row
# Global variables to manage the training thread and the stop flag # Global variables to manage the training thread and the stop flag
train_thread = None train_thread = None
...@@ -365,7 +365,9 @@ async def start_training(request: TrainingRequest): ...@@ -365,7 +365,9 @@ async def start_training(request: TrainingRequest):
except Exception as exc: except Exception as exc:
is_training = False is_training = False
raise HTTPException(status_code=500, detail=str(exc)) raise HTTPException(status_code=500, detail=str(exc))
if len(dataframes) == 0:
is_training = False
raise HTTPException(status_code=500, detail="Can not get any data file")
df = pd.concat(dataframes) df = pd.concat(dataframes)
df = df.dropna() df = df.dropna()
......
...@@ -2,8 +2,6 @@ import numpy as np ...@@ -2,8 +2,6 @@ import numpy as np
import torch import torch
import yaml import yaml
from logger import logger
def get_data_from_yaml(filename): def get_data_from_yaml(filename):
try: try:
with open(filename, 'r') as f: with open(filename, 'r') as f:
...@@ -18,7 +16,7 @@ def seed_everything(seed_value): ...@@ -18,7 +16,7 @@ def seed_everything(seed_value):
torch.manual_seed(seed_value) torch.manual_seed(seed_value)
if torch.cuda.is_available(): if torch.cuda.is_available():
logger.info("Torch available. Start seed_everything.") print("Torch available. Start seed_everything.")
torch.cuda.manual_seed(seed_value) torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value) torch.cuda.manual_seed_all(seed_value)
torch.backends.cudnn.deterministic = True torch.backends.cudnn.deterministic = True
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment