Commit 3bb240fb authored by Cao Duc Anh's avatar Cao Duc Anh

upload all project

parent 76d75890
Pipeline #17728 failed with stages
This diff is collapsed.
device: cuda
classes: ["khac", "phan_dong", "thu_ghet", "khieu_dam"]
model_checkpoint: /src/phobert-base/checkpoint_best.pth
chunk_size: 64
limit_infer_length: 10000
vocab: 'aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~ '
minio:
server: minio:9000
data_labeled: data-annotated
model_trained: model
sqldb:
server: sqldb:5432
table: data_annotated
vncorenlp:
save_dir: /src/VnCoreNLP/
phobert_base:
save_dir: /src/phobert-base/
max_token_length: 256
training:
epoch: 100
batch_size: 8
load_data_worker: 2
k_fold: 5
test_ratio: 0.1
# Data-management API image (FastAPI service backed by Postgres + MinIO).
FROM python:3.11
ENV DEBIAN_FRONTEND=noninteractive
# NOTE(review): PORT=9090 is set here but the CMD below listens on 8001 --
# confirm whether PORT is still read anywhere before removing it.
ENV PYTHONUNBUFFERED=True \
PORT=9090
# Install dependencies
RUN apt-get update \
&& apt-get install -y git
WORKDIR /src
# Install Python dependencies first so this layer is cached across code edits.
COPY ./server_manage_data/requirements.txt /src/
RUN pip install --no-cache-dir -r requirements.txt
COPY ./server_manage_data/*.py /src/
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
[
{
"url": "https://truyensexcogiao.com/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/2/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/3/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/4/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/5/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/6/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/7/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/8/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/9/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/10/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/11/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/12/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/13/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/14/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensexcogiao.com/page/15/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/2/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/3/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/4/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/5/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/6/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/7/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/8/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
},
{
"url": "https://truyensex88.net/page/9/",
"div_class": "noibat",
"child_div_class": "ndtruyen"
}
]
This diff is collapsed.
địt
xuất tinh
dương vật
âm đạo
dâm thủy
lồn
cặc
buồi
đụ
xoa vú
nứng
cặp mông
làm tình
mông
hậu môn
bú chim
chim to
nhấp liên tục
con cu
tử cung
bím
hột le
đầu vú
bầu vú
nắc liên tục
núm vú
âm hộ
bú vú
cặp nhũ hoa
cặp vú
bóp chặt cu
truyensex
vào háng
version: '3.9'
# Settings and configurations that are common for containers
x-nlpcore-common: &nlpcore-common
image: vn-text-moderation:latest
restart: always
env_file:
- env_file/minio.env
depends_on:
- minio
volumes:
- ./config.yaml:/src/config.yaml
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['0']
capabilities: [gpu]
services:
minio:
image: minio/minio
restart: always
env_file:
- env_file/minio.env
ports:
- "9090:9000"
- "9091:9001"
volumes:
- ./minio_data:/data
command: server --console-address ":9001" /data
sqldb:
image: postgres:13
restart: always
env_file:
- env_file/sql.env
- env_file/minio.env
ports:
- "5432:5432"
volumes:
- ./postgres_data:/var/lib/postgresql/data
adminer:
image: adminer
environment:
ADMINER_DEFAULT_SERVER: sqldb
ports:
- "8080:8080"
datamanager:
image: vn-text-moderation-data
restart: always
env_file:
- env_file/sql.env
- env_file/minio.env
depends_on:
- minio
- sqldb
volumes:
- ./config.yaml:/src/config.yaml
ports:
- "8008:8001"
nlpcore01:
<<: *nlpcore-common
hostname: nlpcore01
ports:
- "8002:8001"
nlpcore02:
<<: *nlpcore-common
hostname: nlpcore02
ports:
- "8003:8001"
# Load balancing API use nginx
nginx:
image: nginx:1.25.0
restart: always
depends_on:
- nlpcore01
- nlpcore02
volumes:
- ./nginx/conf.d:/etc/nginx/conf.d
- ./nginx/log:/var/log/nginx/
ports:
- "8001:8001"
nlptraining:
image: vn-text-moderation-train:latest
restart: always
env_file:
- env_file/minio.env
volumes:
- ./config.yaml:/src/config.yaml
- ./runs:/runs
ports:
- "8000:8000"
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['0']
capabilities: [gpu]
tensorboard:
image: tensorflow/tensorflow:latest-py3
command: tensorboard --logdir=/logs --host 0.0.0.0
ports:
- "6006:6006"
volumes:
- ./runs:/logs # Directory holding the TensorBoard logs
MINIO_ROOT_USER=vivas
MINIO_ROOT_PASSWORD=pad12345
\ No newline at end of file
POSTGRES_USER=vivas
POSTGRES_PASSWORD=pad12345
POSTGRES_DB=text_moderation
\ No newline at end of file
This diff is collapsed.
# Inference API image: PyTorch runtime + VnCoreNLP (requires a JRE) + PhoBERT.
FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
ENV DEBIAN_FRONTEND=noninteractive
# NOTE(review): PORT=9090 is set but the CMD below listens on 8001 -- confirm
# whether PORT is still read anywhere.
ENV PYTHONUNBUFFERED=True \
PORT=9090
# Install dependencies
RUN apt-get update \
&& apt-get install -y git default-jre default-jdk
WORKDIR /src
RUN git clone https://github.com/vncorenlp/VnCoreNLP.git
RUN git clone https://huggingface.co/vinai/phobert-base/
# Model weights come from the build context (the HF clone lacks the LFS blob).
COPY ./phobert-base/pytorch_model.bin /src/phobert-base/pytorch_model.bin
COPY ./server_infer/requirements.txt /src/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY ./server_infer/*.py /src/
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
# Round-robin load balancing across the two inference replicas.
upstream manager {
    server nlpcore01:8001;
    server nlpcore02:8001;
}
server {
    listen 8001;
    location / {
        # Forward the original client address and host to the upstream app.
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header Host $host;
        # NOTE(review): a 1-second connect timeout is aggressive; the access
        # logs show intermittent 502s -- confirm whether a larger value (and
        # proxy_next_upstream tuning) is needed.
        proxy_connect_timeout 1;
        proxy_pass http://manager;
    }
}
10.3.3.60 - - [03/Jul/2024:09:42:06 +0000] "POST /text-classify HTTP/1.1" 502 157 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [03/Jul/2024:10:00:38 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [03/Jul/2024:10:01:27 +0000] "POST /text-classify HTTP/1.1" 499 0 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [03/Jul/2024:10:01:29 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [04/Jul/2024:08:30:25 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [04/Jul/2024:08:38:10 +0000] "\xFF\xF4\xFF\xFD\x06" 400 157 "-" "-" "-"
10.3.3.60 - - [04/Jul/2024:08:38:20 +0000] "]" 400 157 "-" "-" "-"
10.3.3.60 - - [04/Jul/2024:08:41:38 +0000] "POST /start-training HTTP/1.1" 404 22 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:03:50:04 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:46 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:49 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:50 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:52 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:53 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:53 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:54 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:55 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:04:10:56 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:09:29:33 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [05/Jul/2024:10:25:57 +0000] "POST /text-classify HTTP/1.1" 200 2309 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:05 +0000] "POST /text-classify HTTP/1.1" 200 2323 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:09 +0000] "POST /text-classify HTTP/1.1" 200 2328 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:10 +0000] "POST /text-classify HTTP/1.1" 200 2326 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:11 +0000] "POST /text-classify HTTP/1.1" 200 2327 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:46:12 +0000] "POST /text-classify HTTP/1.1" 200 2326 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:09:56:27 +0000] "POST /text-classify HTTP/1.1" 200 2306 "-" "PostmanRuntime/7.37.3" "-"
10.3.3.60 - - [09/Jul/2024:10:33:07 +0000] "POST /text-classify HTTP/1.1" 200 2306 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:06:52:22 +0000] "POST /text-classify HTTP/1.1" 200 2313 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:08:40:49 +0000] "POST /text-classify HTTP/1.1" 200 2313 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:08:40:52 +0000] "POST /text-classify HTTP/1.1" 200 2313 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:09:59:05 +0000] "POST /text-classify HTTP/1.1" 200 2300 "-" "curl/7.81.0" "-"
10.3.2.100 - - [10/Jul/2024:10:23:22 +0000] "POST /text-classify HTTP/1.1" 502 157 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:10:23:25 +0000] "POST /text-classify HTTP/1.1" 502 157 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:10:23:29 +0000] "POST /text-classify HTTP/1.1" 502 157 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [10/Jul/2024:10:23:35 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [11/Jul/2024:06:33:03 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
10.3.2.100 - - [11/Jul/2024:06:41:13 +0000] "POST /text-classify HTTP/1.1" 200 2318 "-" "PostmanRuntime/7.37.3" "-"
This diff is collapsed.
import torch.nn as nn
from transformers import AutoModel

class BERTClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear classification head.

    NOTE(review): the attribute names (bert, drop, fc) appear in checkpoint
    state_dict keys; renaming them would break load_state_dict on previously
    saved models.
    """
    def __init__(self, model_bert, n_classes):
        # model_bert: local path/name of the pretrained encoder (loaded
        # offline); n_classes: number of output labels.
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_bert, local_files_only=True)
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): normal_(bias, 0) means mean=0 with the default std=1.0;
        # if a zero bias was intended, nn.init.zeros_ is the usual choice --
        # confirm before changing, as it affects training initialization.
        nn.init.normal_(self.fc.bias, 0)
    def forward(self, input_ids, attention_mask):
        # return_dict=False makes the encoder return a tuple; the second
        # element (pooled output) feeds the classifier head.
        _, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False # Dropout will errors if without this
        )
        x = self.drop(output)
        x = self.fc(x)
        return x
import py_vncorenlp
from utils import get_data_from_yaml

# Importing this module has a side effect: it starts the VnCoreNLP segmenter.
config = get_data_from_yaml("config.yaml")
VNCORENLP_DIR = config.get("vncorenlp")["save_dir"]
# Load the word and sentence segmentation component
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=['wseg'], save_dir=VNCORENLP_DIR)

def preprocess_text(text):
    """Word-segment raw text before tokenization."""
    # NOTE(review): word_segment returns a list of segmented sentences;
    # ''.join concatenates them with no separator between sentences --
    # confirm ' '.join was not intended.
    content = rdrsegmenter.word_segment(text)
    content = ''.join(content)
    return content
\ No newline at end of file
py_vncorenlp
fastapi
uvicorn
numpy
transformers
minio==7.2.7
\ No newline at end of file
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer
from contextlib import asynccontextmanager
from minio import Minio
from minio.error import S3Error
import os
from typing import List
from bert_model import BERTClassifier
from utils import get_data_from_yaml, split_chunk
from preprocess import preprocess_text
# Settings are read once from the mounted config file at import time.
config = get_data_from_yaml("/src/config.yaml")
DEVICE = config.get("device")
CLASSES = config.get("classes")  # label names; model output index maps into this list
MINIO_SERVER = config.get("minio")["server"]
MINIO_DATA_LABELED = config.get("minio")["data_labeled"]
MINIO_MODEL_TRAINED = config.get("minio")["model_trained"]  # bucket holding checkpoints
VNCORENLP_DIR = config.get("vncorenlp")["save_dir"]
PHOBERTBASE_DIR = config.get("phobert_base")["save_dir"]
MAX_TOKEN_LENGTH = config.get("phobert_base")["max_token_length"]
MODEL_CHECKPOINT = config.get("model_checkpoint")  # local path the checkpoint is saved to
CHUNK_SIZE = config.get("chunk_size")  # max words per classification chunk
INFER_LENGTH = config.get("limit_infer_length")  # max request paragraph length (chars)
# MinIO client for checkpoint downloads; credentials come from the environment
# (see env_file/minio.env).
minio_client = Minio(
    endpoint=MINIO_SERVER,
    access_key=os.getenv("MINIO_ROOT_USER"),
    secret_key=os.getenv("MINIO_ROOT_PASSWORD"),
    secure=False  # plain HTTP inside the compose network
)
def download_latest_model():
    """Download the most recently modified *best* checkpoint from MinIO.

    Scans the MINIO_MODEL_TRAINED bucket for objects whose name contains
    "best", picks the newest by last_modified, and saves it to
    MODEL_CHECKPOINT.

    Returns:
        str: object name of the downloaded checkpoint.

    Raises:
        Exception: if no *best* object exists in the bucket.
        S3Error: if the download itself fails.
    """
    objects = minio_client.list_objects(MINIO_MODEL_TRAINED)
    latest_obj = None
    latest_time = None
    for obj in objects:
        if "best" in obj.object_name:
            if latest_time is None or obj.last_modified > latest_time:
                latest_time = obj.last_modified
                latest_obj = obj
    if latest_obj is None:
        raise Exception("No *best* models found in the bucket")
    try:
        minio_client.fget_object(MINIO_MODEL_TRAINED, latest_obj.object_name, MODEL_CHECKPOINT)
    except S3Error as exc:
        # Previously this only printed and still returned the object name,
        # letting the caller load a stale or missing checkpoint file.
        print(f"Error occurred: {exc}")
        raise
    return latest_obj.object_name
tokenizer = AutoTokenizer.from_pretrained(PHOBERTBASE_DIR, local_files_only=True, use_fast=False)

def infer(text, model, tokenizer, class_names, max_len=MAX_TOKEN_LENGTH+2):
    """Classify a single preprocessed text chunk.

    Args:
        text: word-segmented input string.
        model: classifier returning per-class logits of shape (1, n_classes).
        tokenizer: HuggingFace tokenizer for the encoder.
        class_names: list mapping class index -> label name.
        max_len: tokenizer max length (+2 for the special tokens).

    Returns:
        tuple: (confidence as a float in [0, 1], predicted label name).
    """
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(DEVICE)
    attention_mask = encoded_review['attention_mask'].to(DEVICE)
    # Inference only: no autograd graph (avoids growing GPU memory per request).
    with torch.no_grad():
        output = model(input_ids, attention_mask)
        # Softmax turns raw logits into probabilities so the returned
        # "confidence" lies in [0, 1] instead of being an unbounded logit.
        probs = torch.softmax(output, dim=1)
    conf, y_pred = torch.max(probs, dim=1)
    # .item() converts 1-element tensors to plain Python numbers.
    return conf.item(), class_names[y_pred.item()]
# The model is constructed at import time; weights are loaded in the lifespan
# hook below once the latest checkpoint has been downloaded.
model = BERTClassifier(model_bert=PHOBERTBASE_DIR, n_classes=len(CLASSES))
model.to(DEVICE)

@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan: load the newest checkpoint on startup, free CUDA memory on shutdown."""
    global model
    try:
        update_model = download_latest_model()
        model.load_state_dict(torch.load(MODEL_CHECKPOINT))
        model.eval()
        print(f"Model updated: {update_model}")
    except Exception as e:
        # NOTE(review): on failure the server keeps serving with whatever
        # weights the model currently has (initially random) -- confirm this
        # best-effort startup is intended.
        print(f"An error occurred: {e}")
    yield
    # Shutdown: release cached CUDA memory.
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
class ParagraphRequest(BaseModel):
    """Request body for /text-classify."""
    # Raw paragraph to be chunked and classified.
    paragraph: str

class ChunkLabelResponse(BaseModel):
    """One classified chunk of the input paragraph."""
    chunk: str
    label: str
    confidence: float
app = FastAPI(lifespan=lifespan)

@app.post("/text-classify", response_model=List[ChunkLabelResponse])
async def process_paragraph(request: ParagraphRequest):
    """Split a paragraph into word-bounded chunks and classify each one.

    Raises:
        HTTPException: 400 when the paragraph exceeds INFER_LENGTH characters.
    """
    if len(request.paragraph) > INFER_LENGTH:
        raise HTTPException(status_code=400, detail=f"Max length: {INFER_LENGTH}")
    chunks = split_chunk(request.paragraph, max_words=CHUNK_SIZE)
    response = []
    for chunk in chunks:
        text = preprocess_text(chunk)
        confidence, processed_label = infer(text, model, tokenizer, CLASSES)
        # infer() may return a 1-element tensor; coerce explicitly so the
        # pydantic float field receives a plain Python float.
        response.append(ChunkLabelResponse(chunk=chunk, label=processed_label, confidence=float(confidence)))
    return response
import yaml
import re
def get_data_from_yaml(filename):
    """Load a YAML file and return its parsed contents.

    Args:
        filename: path to the YAML file.

    Returns:
        The deserialized document (typically a dict); None for an empty file.

    Raises:
        IOError: if the file cannot be opened; the message names the file.
    """
    try:
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
    except IOError:
        # The f-string previously had no placeholder, so the error never said
        # which file failed; include the path for debuggability.
        raise IOError(f"Error opening file: {filename}")
    return data
def split_chunk(text, max_words=200):
    """Split text into chunks of at most max_words words.

    The text is first cut at sentence terminators (., !, ? followed by
    whitespace, with the punctuation kept as its own segment) and at
    newlines; the resulting segments are then greedily packed into chunks.

    Args:
        text: input string.
        max_words: word budget per chunk.

    Returns:
        list[str]: space-joined word chunks.
    """
    # Terminators are captured so they survive the split; the newline branch
    # has no capture group, so re.split yields None entries for it.
    boundary = re.compile(r'([.!?])\s+|\n')
    pieces = boundary.split(text)

    chunks = []
    buffer = []
    count = 0
    for piece in pieces:
        if piece is None:
            continue
        tokens = piece.split()
        if count + len(tokens) > max_words:
            # Budget exceeded: flush the current chunk and start a new one.
            chunks.append(' '.join(buffer))
            buffer = tokens
            count = len(tokens)
        else:
            buffer.extend(tokens)
            count += len(tokens)
    # Flush whatever remains.
    if buffer:
        chunks.append(' '.join(buffer))
    return chunks
from utils import get_data_from_yaml

# Centralized settings read once from the mounted config file.
config = get_data_from_yaml("/src/config.yaml")
DB_SERVER = config.get("sqldb")["server"]  # host:port of Postgres
DB_TABLENAME = config.get("sqldb")["table"]
MINIO_SERVER = config.get("minio")["server"]
MINIO_DATA_LABELED = config.get("minio")["data_labeled"]  # bucket for labeled data
MINIO_MODEL_TRAINED = config.get("minio")["model_trained"]  # bucket for trained models
\ No newline at end of file
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
import os
from config import DB_SERVER

# Credentials come from the environment (see env_file/sql.env).
DATABASE_URL = f'postgresql+asyncpg://{os.getenv("POSTGRES_USER")}:{os.getenv("POSTGRES_PASSWORD")}@{DB_SERVER}/{os.getenv("POSTGRES_DB")}'
# echo=True logs every SQL statement -- noisy; consider disabling in production.
engine = create_async_engine(DATABASE_URL, echo=True)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine, class_=AsyncSession)

async def get_db():
    # FastAPI dependency: yields one async session per request and closes it
    # when the request finishes.
    async with SessionLocal() as session:
        yield session
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from config import DB_TABLENAME

Base = declarative_base()

class LabeledData(Base):
    """ORM row: one human-labeled paragraph."""
    __tablename__ = DB_TABLENAME
    id = Column(Integer, primary_key=True, index=True)
    paragraph = Column(String, nullable=False)  # raw paragraph text
    label = Column(String, nullable=False)  # class name assigned by the annotator
fastapi
sqlalchemy
asyncpg
psycopg2-binary
minio==7.2.7
pandas==2.2.1
from pydantic import BaseModel

class LabeledDataCreate(BaseModel):
    """Request/response schema for adding one labeled paragraph."""
    # paragraph text being labeled
    paragraph: str
    # class name assigned to the paragraph
    label: str
from fastapi import FastAPI, Depends, HTTPException
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import MetaData, Table, inspect
from minio import Minio
from minio.error import S3Error
import os
from models import Base, LabeledData
from schemas import LabeledDataCreate
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import sessionmaker
import pandas as pd
from io import StringIO, BytesIO
from database import engine, get_db, SessionLocal
from config import MINIO_SERVER, MINIO_MODEL_TRAINED, MINIO_DATA_LABELED
from utils import check_bucket
# MinIO client for the object store; credentials from env_file/minio.env.
minio_client = Minio(
    endpoint=MINIO_SERVER,
    access_key=os.getenv("MINIO_ROOT_USER"),
    secret_key=os.getenv("MINIO_ROOT_PASSWORD"),
    secure=False  # plain HTTP inside the compose network
)
# Buckets that must exist before the service accepts requests.
bucket_names = [MINIO_DATA_LABELED, MINIO_MODEL_TRAINED]
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup: ensure MinIO buckets and DB tables exist; nothing on shutdown."""
    check_bucket(minio_client=minio_client, bucket_names=bucket_names)
    # create_all is idempotent: it only creates tables that are missing.
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    yield
app = FastAPI(lifespan=lifespan)

@app.post("/add-data", response_model=LabeledDataCreate)
async def create_labeled_data(data: LabeledDataCreate, db: AsyncSession = Depends(get_db)):
    """Persist one labeled paragraph and echo it back.

    Raises:
        HTTPException: 500 when the insert fails; the session is rolled back
        so it remains usable.
    """
    new_data = LabeledData(paragraph=data.paragraph, label=data.label)
    try:
        db.add(new_data)
        await db.commit()
        await db.refresh(new_data)
    except Exception as e:
        # Without a rollback the session stays in a failed transaction state
        # and every subsequent operation on it errors out.
        await db.rollback()
        raise HTTPException(status_code=500, detail=str(e))
    return new_data
import yaml
def get_data_from_yaml(filename):
    """Load a YAML file and return its parsed contents.

    Args:
        filename: path to the YAML file.

    Returns:
        The deserialized document (typically a dict); None for an empty file.

    Raises:
        IOError: if the file cannot be opened; the message names the file.
    """
    try:
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
    except IOError:
        # The f-string previously had no placeholder, so the error never said
        # which file failed; include the path for debuggability.
        raise IOError(f"Error opening file: {filename}")
    return data
def check_bucket(minio_client, bucket_names):
    """Ensure every bucket in bucket_names exists, creating any that are missing.

    Args:
        minio_client: client exposing bucket_exists() and make_bucket().
        bucket_names: iterable of bucket names to verify.
    """
    for name in bucket_names:
        if minio_client.bucket_exists(name):
            print(f"Bucket '{name}' already exists.")
        else:
            # Missing bucket: create it so uploads don't fail later.
            minio_client.make_bucket(name)
            print(f"Bucket '{name}' created successfully.")
\ No newline at end of file
import torch.nn as nn
from transformers import AutoModel

class BERTClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear classification head.

    NOTE(review): the attribute names (bert, drop, fc) appear in checkpoint
    state_dict keys; renaming them would break load_state_dict on previously
    saved models. Keep this definition in sync with the server_infer copy.
    """
    def __init__(self, model_bert, n_classes):
        # model_bert: local path/name of the pretrained encoder (loaded
        # offline); n_classes: number of output labels.
        super(BERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_bert, local_files_only=True)
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        # NOTE(review): normal_(bias, 0) means mean=0 with the default std=1.0;
        # if a zero bias was intended, nn.init.zeros_ is the usual choice --
        # confirm before changing, as it affects training initialization.
        nn.init.normal_(self.fc.bias, 0)
    def forward(self, input_ids, attention_mask):
        # return_dict=False makes the encoder return a tuple; the second
        # element (pooled output) feeds the classifier head.
        _, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False # Dropout will errors if without this
        )
        x = self.drop(output)
        x = self.fc(x)
        return x
import py_vncorenlp
from utils import get_data_from_yaml

# Importing this module has a side effect: it starts the VnCoreNLP segmenter.
config = get_data_from_yaml("/src/config.yaml")
CLASSES = config.get("classes")
VNCORENLP_DIR = config.get("vncorenlp")["save_dir"]
# Load the word and sentence segmentation component
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=['wseg'], save_dir=VNCORENLP_DIR)

def preprocess_row(row):
    """Word-segment a row's text; returns None for rows with an unknown label."""
    # Check whether the label is one of the configured classes
    if row['label'] in CLASSES:
        content = rdrsegmenter.word_segment(row['text'])
        # NOTE(review): ''.join concatenates segmented sentences with no
        # separator -- confirm ' '.join was not intended.
        content = ''.join(content)
        return content
    else:
        # Invalid label: drop the row by returning None
        return None
\ No newline at end of file
celery[redis]
py_vncorenlp==0.1.4
fastapi
uvicorn
numpy==1.24.3
transformers==4.39.2
pandas==2.2.1
minio==7.2.7
scikit-learn==1.4.1.post1
scipy==1.12.0
gensim==4.3.2
tensorboard
\ No newline at end of file
This diff is collapsed.
import numpy as np
import torch
import yaml
def get_data_from_yaml(filename):
    """Load a YAML file and return its parsed contents.

    Args:
        filename: path to the YAML file.

    Returns:
        The deserialized document (typically a dict); None for an empty file.

    Raises:
        IOError: if the file cannot be opened; the message names the file.
    """
    try:
        with open(filename, 'r') as f:
            data = yaml.safe_load(f)
    except IOError:
        # The f-string previously had no placeholder, so the error never said
        # which file failed; include the path for debuggability.
        raise IOError(f"Error opening file: {filename}")
    return data
def seed_everything(seed_value):
    """Seed the numpy and torch RNGs for reproducible training runs.

    Args:
        seed_value: integer seed applied to every RNG.
    """
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        print("Torch available")
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # benchmark=True lets cuDNN auto-tune to the fastest (possibly
        # nondeterministic) kernels, defeating deterministic=True above;
        # it must be False for reproducible runs.
        torch.backends.cudnn.benchmark = False
# Training API image: PyTorch runtime + VnCoreNLP (requires a JRE) + PhoBERT.
FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
ENV DEBIAN_FRONTEND=noninteractive
# NOTE(review): PORT=9090 is set but the CMD below listens on 8000 -- confirm
# whether PORT is still read anywhere.
ENV PYTHONUNBUFFERED=True \
PORT=9090
# Install dependencies
RUN apt-get update \
&& apt-get install -y git default-jre default-jdk
WORKDIR /src
RUN git clone https://github.com/vncorenlp/VnCoreNLP.git
RUN git clone https://huggingface.co/vinai/phobert-base/
# Model weights come from the build context (the HF clone lacks the LFS blob).
COPY ./phobert-base/pytorch_model.bin /src/phobert-base/pytorch_model.bin
COPY ./server_train/requirements.txt /src/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY ./server_train/*.py /src/
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment