Commit 3bb240fb authored by Cao Duc Anh's avatar Cao Duc Anh

upload all project

parent 76d75890
Pipeline #17728 failed with stages
This diff is collapsed.
device: cuda
classes: ["khac", "phan_dong", "thu_ghet", "khieu_dam"]
model_checkpoint: /src/phobert-base/checkpoint_best.pth
chunk_size: 64
limit_infer_length: 10000
vocab: 'aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~ '
minio:
server: minio:9000
data_labeled: data-annotated
model_trained: model
sqldb:
server: sqldb:5432
table: data_annotated
vncorenlp:
save_dir: /src/VnCoreNLP/
phobert_base:
save_dir: /src/phobert-base/
max_token_length: 256
training:
epoch: 100
batch_size: 8
load_data_worker: 2
k_fold: 5
test_ratio: 0.1
# Data-management API image (FastAPI service backed by Postgres + MinIO).
FROM python:3.11
ENV DEBIAN_FRONTEND=noninteractive
# NOTE(review): PORT=9090 is set here but the CMD below listens on 8001 --
# confirm whether PORT is still read anywhere before removing it.
ENV PYTHONUNBUFFERED=True \
PORT=9090
# Install dependencies
RUN apt-get update \
&& apt-get install -y git
WORKDIR /src
# Install Python dependencies first so this layer is cached across code edits.
COPY ./server_manage_data/requirements.txt /src/
RUN pip install --no-cache-dir -r requirements.txt
COPY ./server_manage_data/*.py /src/
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
[
{
"url": "https://truyensexcogiao.com/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/2/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/3/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/4/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/5/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/6/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/7/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/8/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/9/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/10/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/11/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/12/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/13/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/14/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/15/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/2/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/3/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/4/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/5/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/6/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/7/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/8/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/9/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
}
]
This diff is collapsed.
địt
xuất tinh
dương vật
âm đạo
dâm thủy
lồn
cặc
buồi
đụ
xoa vú
nứng
cặp mông
làm tình
mông
hậu môn
bú chim
chim to
nhấp liên tục
con cu
tử cung
bím
hột le
đầu vú
bầu vú
nắc liên tục
núm vú
âm hộ
bú vú
cặp nhũ hoa
cặp vú
bóp chặt cu
truyensex
vào háng
version: '3.9'
# Settings and configurations that are common for containers
x-nlpcore-common: &nlpcore-common
image: vn-text-moderation:latest
restart: always
env_file:
- env_file/minio.env
depends_on:
- minio
volumes:
- ./config.yaml:/src/config.yaml
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['0']
capabilities: [gpu]
services:
minio:
image: minio/minio
restart: always
env_file:
- env_file/minio.env
ports:
- "9090:9000"
- "9091:9001"
volumes:
- ./minio_data:/data
command: server --console-address ":9001" /data
sqldb:
image: postgres:13
restart: always
env_file:
- env_file/sql.env
- env_file/minio.env
ports:
- "5432:5432"
volumes:
- ./postgres_data:/var/lib/postgresql/data
adminer:
image: adminer
environment:
ADMINER_DEFAULT_SERVER: sqldb
ports:
- "8080:8080"
datamanager:
image: vn-text-moderation-data
restart: always
env_file:
- env_file/sql.env
- env_file/minio.env
depends_on:
- minio
- sqldb
volumes:
- ./config.yaml:/src/config.yaml
ports:
- "8008:8001"
nlpcore01:
<<: *nlpcore-common
hostname: nlpcore01
ports:
- "8002:8001"
nlpcore02:
<<: *nlpcore-common
hostname: nlpcore02
ports:
- "8003:8001"
# Load balancing API use nginx
nginx:
image: nginx:1.25.0
restart: always
depends_on:
- nlpcore01
- nlpcore02
volumes:
- ./nginx/conf.d:/etc/nginx/conf.d
- ./nginx/log:/var/log/nginx/
ports:
- "8001:8001"
nlptraining:
image: vn-text-moderation-train:latest
restart: always
env_file:
- env_file/minio.env
volumes:
- ./config.yaml:/src/config.yaml
- ./runs:/runs
ports:
- "8000:8000"
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['0']
capabilities: [gpu]
tensorboard:
image: tensorflow/tensorflow:latest-py3
command: tensorboard --logdir=/logs --host 0.0.0.0
ports:
- "6006:6006"
volumes:
- ./runs:/logs # Directory holding the TensorBoard logs
MINIO_ROOT_USER=vivas
MINIO_ROOT_PASSWORD=pad12345
\ No newline at end of file
POSTGRES_USER=vivas
POSTGRES_PASSWORD=pad12345
POSTGRES_DB=text_moderation
\ No newline at end of file
This diff is collapsed.
# Inference API image: PyTorch runtime + VnCoreNLP (requires a JRE) + PhoBERT.
FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
ENV DEBIAN_FRONTEND=noninteractive
# NOTE(review): PORT=9090 is set but the CMD below listens on 8001 -- confirm
# whether PORT is still read anywhere.
ENV PYTHONUNBUFFERED=True \
PORT=9090
# Install dependencies
RUN apt-get update \
&& apt-get install -y git default-jre default-jdk
WORKDIR /src
RUN git clone https://github.com/vncorenlp/VnCoreNLP.git
RUN git clone https://huggingface.co/vinai/phobert-base/
# Model weights come from the build context (the HF clone lacks the LFS blob).
COPY ./phobert-base/pytorch_model.bin /src/phobert-base/pytorch_model.bin
COPY ./server_infer/requirements.txt /src/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY ./server_infer/*.py /src/
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
# Round-robin load balancing across the two inference replicas.
upstream manager {
    server nlpcore01:8001;
    server nlpcore02:8001;
}
server {
    listen 8001;
    location / {
        # Forward the original client address and host to the upstream app.
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header Host $host;
        # NOTE(review): a 1-second connect timeout is aggressive; the access
        # logs show intermittent 502s -- confirm whether a larger value (and
        # proxy_next_upstream tuning) is needed.
        proxy_connect_timeout 1;
        proxy_pass http://manager;
    }
}
10.3.3.60 - - [03/Jul/2024:09:42:06 +0000] "POST /text-classify HTTP/1.1" 502 157 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [03/Jul/2024:10:00:38 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [03/Jul/2024:10:01:27 +0000] "POST /text-classify HTTP/1.1" 499 0 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [03/Jul/2024:10:01:29 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [04/Jul/2024:08:30:25 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [04/Jul/2024:08:38:10 +0000] "\xFF\xF4\xFF\xFD\x06" 400 157 "-" "-" "-"
10.3.3.60 - - [04/Jul/2024:08:38:20 +0000] "]" 400 157 "-" "-" "-"
10.3.3.60 - - [04/Jul/2024:08:41:38 +0000] "POST /start-training HTTP/1.1" 404 22 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:03:50:04 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:46 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:49 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:50 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:52 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:53 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:53 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:54 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:55 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:56 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:09:29:33 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:10:25:57 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:05 +0000] "POST /text-classify HTTP/1.1" 200 2323 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:09 +0000] "POST /text-classify HTTP/1.1" 200 2328 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:10 +0000] "POST /text-classify HTTP/1.1" 200 2326 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:11 +0000] "POST /text-classify HTTP/1.1" 200 2327 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:12 +0000] "POST /text-classify HTTP/1.1" 200 2326 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:56:27 +0000] "POST /text-classify HTTP/1.1" 200 2306 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:10:33:07 +0000] "POST /text-classify HTTP/1.1" 200 2306 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:06:52:22 +0000] "POST /text-classify HTTP/1.1" 200 2313 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:08:40:49 +0000] "POST /text-classify HTTP/1.1" 200 2313 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:08:40:52 +0000] "POST /text-classify HTTP/1.1" 200 2313 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:09:59:05 +0000] "POST /text-classify HTTP/1.1" 200 2300 "-" "curl/7.81.0" "-"
10.3.2.100 - - [10/Jul/2024:10:23:22 +0000] "POST /text-classify HTTP/1.1" 502 157 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:10:23:25 +0000] "POST /text-classify HTTP/1.1" 502 157 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:10:23:29 +0000] "POST /text-classify HTTP/1.1" 502 157 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:10:23:35 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [11/Jul/2024:06:33:03 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [11/Jul/2024:06:41:13 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
This diff is collapsed.
import torch.nn as nn
from transformers import AutoModel

class BERTClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear classification head.

    NOTE(review): the attribute names (bert, drop, fc) appear in checkpoint
    state_dict keys; renaming them would break load_state_dict on previously
    saved models.
    """
    def __init__(self, model_bert, n_classes):
        # model_bert: local path/name of the pretrained encoder (loaded
        # offline); n_classes: number of output labels.
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_bert, local_files_only=True)
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): normal_(bias, 0) means mean=0 with the default std=1.0;
        # if a zero bias was intended, nn.init.zeros_ is the usual choice --
        # confirm before changing, as it affects training initialization.
        nn.init.normal_(self.fc.bias, 0)
    def forward(self, input_ids, attention_mask):
        # return_dict=False makes the encoder return a tuple; the second
        # element (pooled output) feeds the classifier head.
        _, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False # Dropout will errors if without this
        )
        x = self.drop(output)
        x = self.fc(x)
        return x
import py_vncorenlp
from utils import get_data_from_yaml

# Importing this module has a side effect: it starts the VnCoreNLP segmenter.
config = get_data_from_yaml("config.yaml")
VNCORENLP_DIR = config.get("vncorenlp")["save_dir"]
# Load the word and sentence segmentation component
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=['wseg'], save_dir=VNCORENLP_DIR)

def preprocess_text(text):
    """Word-segment raw text before tokenization."""
    # NOTE(review): word_segment returns a list of segmented sentences;
    # ''.join concatenates them with no separator between sentences --
    # confirm ' '.join was not intended.
    content = rdrsegmenter.word_segment(text)
    content = ''.join(content)
    return content
\ No newline at end of file
py_vncorenlp
fastapi
uvicorn
numpy
transformers
minio==7.2.7
\ No newline at end of file
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer
from contextlib import asynccontextmanager
from minio import Minio
from minio.error import S3Error
import os
from typing import List
from bert_model import BERTClassifier
from utils import get_data_from_yaml, split_chunk
from preprocess import preprocess_text
# Settings are read once from the mounted config file at import time.
config = get_data_from_yaml("/src/config.yaml")
DEVICE = config.get("device")
CLASSES = config.get("classes")  # label names; model output index maps into this list
MINIO_SERVER = config.get("minio")["server"]
MINIO_DATA_LABELED = config.get("minio")["data_labeled"]
MINIO_MODEL_TRAINED = config.get("minio")["model_trained"]  # bucket holding checkpoints
VNCORENLP_DIR = config.get("vncorenlp")["save_dir"]
PHOBERTBASE_DIR = config.get("phobert_base")["save_dir"]
MAX_TOKEN_LENGTH = config.get("phobert_base")["max_token_length"]
MODEL_CHECKPOINT = config.get("model_checkpoint")  # local path the checkpoint is saved to
CHUNK_SIZE = config.get("chunk_size")  # max words per classification chunk
INFER_LENGTH = config.get("limit_infer_length")  # max request paragraph length (chars)
# MinIO client for checkpoint downloads; credentials come from the environment
# (see env_file/minio.env).
minio_client = Minio(
    endpoint=MINIO_SERVER,
    access_key=os.getenv("MINIO_ROOT_USER"),
    secret_key=os.getenv("MINIO_ROOT_PASSWORD"),
    secure=False  # plain HTTP inside the compose network
)
def download_latest_model():
    """Download the most recently modified *best* checkpoint from MinIO.

    Scans the MINIO_MODEL_TRAINED bucket for objects whose name contains
    "best", picks the newest by last_modified, and saves it to
    MODEL_CHECKPOINT.

    Returns:
        str: object name of the downloaded checkpoint.

    Raises:
        Exception: if no *best* object exists in the bucket.
        S3Error: if the download itself fails.
    """
    objects = minio_client.list_objects(MINIO_MODEL_TRAINED)
    latest_obj = None
    latest_time = None
    for obj in objects:
        if "best" in obj.object_name:
            if latest_time is None or obj.last_modified > latest_time:
                latest_time = obj.last_modified
                latest_obj = obj
    if latest_obj is None:
        raise Exception("No *best* models found in the bucket")
    try:
        minio_client.fget_object(MINIO_MODEL_TRAINED, latest_obj.object_name, MODEL_CHECKPOINT)
    except S3Error as exc:
        # Previously this only printed and still returned the object name,
        # letting the caller load a stale or missing checkpoint file.
        print(f"Error occurred: {exc}")
        raise
    return latest_obj.object_name
tokenizer = AutoTokenizer.from_pretrained(PHOBERTBASE_DIR, local_files_only=True, use_fast=False)

def infer(text, model, tokenizer, class_names, max_len=MAX_TOKEN_LENGTH+2):
    """Classify a single preprocessed text chunk.

    Args:
        text: word-segmented input string.
        model: classifier returning per-class logits of shape (1, n_classes).
        tokenizer: HuggingFace tokenizer for the encoder.
        class_names: list mapping class index -> label name.
        max_len: tokenizer max length (+2 for the special tokens).

    Returns:
        tuple: (confidence as a float in [0, 1], predicted label name).
    """
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(DEVICE)
    attention_mask = encoded_review['attention_mask'].to(DEVICE)
    # Inference only: no autograd graph (avoids growing GPU memory per request).
    with torch.no_grad():
        output = model(input_ids, attention_mask)
        # Softmax turns raw logits into probabilities so the returned
        # "confidence" lies in [0, 1] instead of being an unbounded logit.
        probs = torch.softmax(output, dim=1)
    conf, y_pred = torch.max(probs, dim=1)
    # .item() converts 1-element tensors to plain Python numbers.
    return conf.item(), class_names[y_pred.item()]
# The model is constructed at import time; weights are loaded in the lifespan
# hook below once the latest checkpoint has been downloaded.
model = BERTClassifier(model_bert=PHOBERTBASE_DIR, n_classes=len(CLASSES))
model.to(DEVICE)

@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan: load the newest checkpoint on startup, free CUDA memory on shutdown."""
    global model
    try:
        update_model = download_latest_model()
        model.load_state_dict(torch.load(MODEL_CHECKPOINT))
        model.eval()
        print(f"Model updated: {update_model}")
    except Exception as e:
        # NOTE(review): on failure the server keeps serving with whatever
        # weights the model currently has (initially random) -- confirm this
        # best-effort startup is intended.
        print(f"An error occurred: {e}")
    yield
    # Shutdown: release cached CUDA memory.
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
class ParagraphRequest(BaseModel):
    """Request body for /text-classify."""
    # Raw paragraph to be chunked and classified.
    paragraph: str

class ChunkLabelResponse(BaseModel):
    """One classified chunk of the input paragraph."""
    chunk: str
    label: str
    confidence: float
app = FastAPI(lifespan=lifespan)

@app.post("/text-classify", response_model=List[ChunkLabelResponse])
async def process_paragraph(request: ParagraphRequest):
    """Split a paragraph into word-bounded chunks and classify each one.

    Raises:
        HTTPException: 400 when the paragraph exceeds INFER_LENGTH characters.
    """
    if len(request.paragraph) > INFER_LENGTH:
        raise HTTPException(status_code=400, detail=f"Max length: {INFER_LENGTH}")
    chunks = split_chunk(request.paragraph, max_words=CHUNK_SIZE)
    response = []
    for chunk in chunks:
        text = preprocess_text(chunk)
        confidence, processed_label = infer(text, model, tokenizer, CLASSES)
        # infer() may return a 1-element tensor; coerce explicitly so the
        # pydantic float field receives a plain Python float.
        response.append(ChunkLabelResponse(chunk=chunk, label=processed_label, confidence=float(confidence)))
    return response
import yaml
import re
def get_data_from_yaml(filename):
    """Load a YAML file and return its parsed contents.

    Args:
        filename: path to the YAML file.

    Returns:
        The deserialized document (typically a dict); None for an empty file.

    Raises:
        IOError: if the file cannot be opened; the message names the file.
    """
    try:
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
    except IOError:
        # The f-string previously had no placeholder, so the error never said
        # which file failed; include the path for debuggability.
        raise IOError(f"Error opening file: {filename}")
    return data
def split_chunk(text, max_words=200):
    """Split text into chunks of at most max_words words.

    The text is first cut at sentence terminators (., !, ? followed by
    whitespace, with the punctuation kept as its own segment) and at
    newlines; the resulting segments are then greedily packed into chunks.

    Args:
        text: input string.
        max_words: word budget per chunk.

    Returns:
        list[str]: space-joined word chunks.
    """
    # Terminators are captured so they survive the split; the newline branch
    # has no capture group, so re.split yields None entries for it.
    boundary = re.compile(r'([.!?])\s+|\n')
    pieces = boundary.split(text)

    chunks = []
    buffer = []
    count = 0
    for piece in pieces:
        if piece is None:
            continue
        tokens = piece.split()
        if count + len(tokens) > max_words:
            # Budget exceeded: flush the current chunk and start a new one.
            chunks.append(' '.join(buffer))
            buffer = tokens
            count = len(tokens)
        else:
            buffer.extend(tokens)
            count += len(tokens)
    # Flush whatever remains.
    if buffer:
        chunks.append(' '.join(buffer))
    return chunks
from utils import get_data_from_yaml

# Centralized settings read once from the mounted config file.
config = get_data_from_yaml("/src/config.yaml")
DB_SERVER = config.get("sqldb")["server"]  # host:port of Postgres
DB_TABLENAME = config.get("sqldb")["table"]
MINIO_SERVER = config.get("minio")["server"]
MINIO_DATA_LABELED = config.get("minio")["data_labeled"]  # bucket for labeled data
MINIO_MODEL_TRAINED = config.get("minio")["model_trained"]  # bucket for trained models
\ No newline at end of file
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
import os
from config import DB_SERVER

# Credentials come from the environment (see env_file/sql.env).
DATABASE_URL = f'postgresql+asyncpg://{os.getenv("POSTGRES_USER")}:{os.getenv("POSTGRES_PASSWORD")}@{DB_SERVER}/{os.getenv("POSTGRES_DB")}'
# echo=True logs every SQL statement -- noisy; consider disabling in production.
engine = create_async_engine(DATABASE_URL, echo=True)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine, class_=AsyncSession)

async def get_db():
    # FastAPI dependency: yields one async session per request and closes it
    # when the request finishes.
    async with SessionLocal() as session:
        yield session
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from config import DB_TABLENAME

Base = declarative_base()

class LabeledData(Base):
    """ORM row: one human-labeled paragraph."""
    __tablename__ = DB_TABLENAME
    id = Column(Integer, primary_key=True, index=True)
    paragraph = Column(String, nullable=False)  # raw paragraph text
    label = Column(String, nullable=False)  # class name assigned by the annotator
fastapi
sqlalchemy
asyncpg
psycopg2-binary
minio==7.2.7
pandas==2.2.1
from pydantic import BaseModel

class LabeledDataCreate(BaseModel):
    """Request/response schema for adding one labeled paragraph."""
    # paragraph text being labeled
    paragraph: str
    # class name assigned to the paragraph
    label: str
from fastapi import FastAPI, Depends, HTTPException
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import MetaData, Table, inspect
from minio import Minio
from minio.error import S3Error
import os
from models import Base, LabeledData
from schemas import LabeledDataCreate
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import sessionmaker
import pandas as pd
from io import StringIO, BytesIO
from database import engine, get_db, SessionLocal
from config import MINIO_SERVER, MINIO_MODEL_TRAINED, MINIO_DATA_LABELED
from utils import check_bucket
# MinIO client for the object store; credentials from env_file/minio.env.
minio_client = Minio(
    endpoint=MINIO_SERVER,
    access_key=os.getenv("MINIO_ROOT_USER"),
    secret_key=os.getenv("MINIO_ROOT_PASSWORD"),
    secure=False  # plain HTTP inside the compose network
)
# Buckets that must exist before the service accepts requests.
bucket_names = [MINIO_DATA_LABELED, MINIO_MODEL_TRAINED]
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup: ensure MinIO buckets and DB tables exist; nothing on shutdown."""
    check_bucket(minio_client=minio_client, bucket_names=bucket_names)
    # create_all is idempotent: it only creates tables that are missing.
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    yield
app = FastAPI(lifespan=lifespan)

@app.post("/add-data", response_model=LabeledDataCreate)
async def create_labeled_data(data: LabeledDataCreate, db: AsyncSession = Depends(get_db)):
    """Persist one labeled paragraph and echo it back.

    Raises:
        HTTPException: 500 when the insert fails; the session is rolled back
        so it remains usable.
    """
    new_data = LabeledData(paragraph=data.paragraph, label=data.label)
    try:
        db.add(new_data)
        await db.commit()
        await db.refresh(new_data)
    except Exception as e:
        # Without a rollback the session stays in a failed transaction state
        # and every subsequent operation on it errors out.
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e))
    return new_data
import yaml
def get_data_from_yaml(filename):
    """Load a YAML file and return its parsed contents.

    Args:
        filename: path to the YAML file.

    Returns:
        The deserialized document (typically a dict); None for an empty file.

    Raises:
        IOError: if the file cannot be opened; the message names the file.
    """
    try:
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
    except IOError:
        # The f-string previously had no placeholder, so the error never said
        # which file failed; include the path for debuggability.
        raise IOError(f"Error opening file: {filename}")
    return data
def check_bucket(minio_client, bucket_names):
    """Ensure every bucket in bucket_names exists, creating any that are missing.

    Args:
        minio_client: client exposing bucket_exists() and make_bucket().
        bucket_names: iterable of bucket names to verify.
    """
    for name in bucket_names:
        if minio_client.bucket_exists(name):
            print(f"Bucket '{name}' already exists.")
        else:
            # Missing bucket: create it so uploads don't fail later.
            minio_client.make_bucket(name)
            print(f"Bucket '{name}' created successfully.")
\ No newline at end of file
import torch.nn as nn
from transformers import AutoModel

class BERTClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear classification head.

    NOTE(review): the attribute names (bert, drop, fc) appear in checkpoint
    state_dict keys; renaming them would break load_state_dict on previously
    saved models. Keep this definition in sync with the server_infer copy.
    """
    def __init__(self, model_bert, n_classes):
        # model_bert: local path/name of the pretrained encoder (loaded
        # offline); n_classes: number of output labels.
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_bert, local_files_only=True)
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): normal_(bias, 0) means mean=0 with the default std=1.0;
        # if a zero bias was intended, nn.init.zeros_ is the usual choice --
        # confirm before changing, as it affects training initialization.
        nn.init.normal_(self.fc.bias, 0)
    def forward(self, input_ids, attention_mask):
        # return_dict=False makes the encoder return a tuple; the second
        # element (pooled output) feeds the classifier head.
        _, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False # Dropout will errors if without this
        )
        x = self.drop(output)
        x = self.fc(x)
        return x
import py_vncorenlp
from utils import get_data_from_yaml

# Importing this module has a side effect: it starts the VnCoreNLP segmenter.
config = get_data_from_yaml("/src/config.yaml")
CLASSES = config.get("classes")
VNCORENLP_DIR = config.get("vncorenlp")["save_dir"]
# Load the word and sentence segmentation component
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=['wseg'], save_dir=VNCORENLP_DIR)

def preprocess_row(row):
    """Word-segment a row's text; returns None for rows with an unknown label."""
    # Check whether the label is one of the configured classes
    if row['label'] in CLASSES:
        content = rdrsegmenter.word_segment(row['text'])
        # NOTE(review): ''.join concatenates segmented sentences with no
        # separator -- confirm ' '.join was not intended.
        content = ''.join(content)
        return content
    else:
        # Invalid label: drop the row by returning None
        return None
\ No newline at end of file
celery[redis]
py_vncorenlp==0.1.4
fastapi
uvicorn
numpy==1.24.3
transformers==4.39.2
pandas==2.2.1
minio==7.2.7
scikit-learn==1.4.1.post1
scipy==1.12.0
gensim==4.3.2
tensorboard
\ No newline at end of file
This diff is collapsed.
import numpy as np
import torch
import yaml
def get_data_from_yaml(filename):
    """Load a YAML file and return its parsed contents.

    Args:
        filename: path to the YAML file.

    Returns:
        The deserialized document (typically a dict); None for an empty file.

    Raises:
        IOError: if the file cannot be opened; the message names the file.
    """
    try:
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
    except IOError:
        # The f-string previously had no placeholder, so the error never said
        # which file failed; include the path for debuggability.
        raise IOError(f"Error opening file: {filename}")
    return data
def seed_everything(seed_value):
    """Seed the numpy and torch RNGs for reproducible training runs.

    Args:
        seed_value: integer seed applied to every RNG.
    """
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        print("Torch available")
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # benchmark=True lets cuDNN auto-tune to the fastest (possibly
        # nondeterministic) kernels, defeating deterministic=True above;
        # it must be False for reproducible runs.
        torch.backends.cudnn.benchmark = False
# Training API image: PyTorch runtime + VnCoreNLP (requires a JRE) + PhoBERT.
FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
ENV DEBIAN_FRONTEND=noninteractive
# NOTE(review): PORT=9090 is set but the CMD below listens on 8000 -- confirm
# whether PORT is still read anywhere.
ENV PYTHONUNBUFFERED=True \
PORT=9090
# Install dependencies
RUN apt-get update \
&& apt-get install -y git default-jre default-jdk
WORKDIR /src
RUN git clone https://github.com/vncorenlp/VnCoreNLP.git
RUN git clone https://huggingface.co/vinai/phobert-base/
# Model weights come from the build context (the HF clone lacks the LFS blob).
COPY ./phobert-base/pytorch_model.bin /src/phobert-base/pytorch_model.bin
COPY ./server_train/requirements.txt /src/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY ./server_train/*.py /src/
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment