Commit 479f9de1 authored by Cao Duc Anh

reupdate

parent 7a3e3b97
# change to the list of characters in your dataset, or use the default Vietnamese characters
vocab: 'aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0123456789!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~ '
# cpu, cuda, cuda:0
device: cuda:0
seq_modeling: transformer
transformer:
d_model: 256
nhead: 8
num_encoder_layers: 6
num_decoder_layers: 6
dim_feedforward: 2048
max_seq_length: 1024
pos_dropout: 0.1
trans_dropout: 0.1
optimizer:
max_lr: 0.0003
pct_start: 0.1
trainer:
batch_size: 32
print_every: 200
valid_every: 4000
iters: 100000
# where to save our model for prediction
export: ./weights/transformerocr.pth
checkpoint: ./checkpoint/transformerocr_checkpoint.pth
log: ./train.log
# null to disable accuracy computation, or set to the number of samples used for validation while training
metrics: null
dataset:
# name of your dataset
name: data
# path to the annotation files and images
data_root: ./img/
train_annotation: annotation_train.txt
valid_annotation: annotation_val_small.txt
# resize images to a height of 32; a larger height will increase accuracy
image_height: 32
image_min_width: 32
image_max_width: 512
dataloader:
num_workers: 3
pin_memory: True
aug:
image_aug: true
masked_language_model: true
predictor:
# enable or disable beam search during prediction; using beam search is slower
beamsearch: False
quiet: False
\ No newline at end of file
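For orientation, the block above is the base OCR configuration (vocab, device, transformer head, trainer, dataset, dataloader, augmentation and predictor settings). A minimal sketch of how such a file is typically consumed for prediction is shown below; it assumes the usual vietocr `Cfg`/`Predictor` API, a local copy of this config saved as `config.yml`, and a hypothetical image path, so treat it as illustrative rather than part of this commit.

# Minimal usage sketch (assumptions: vietocr's Cfg/Predictor API, a local config.yml, a sample image)
from PIL import Image
from vietocr.tool.config import Cfg
from vietocr.tool.predictor import Predictor

config = Cfg.load_config_from_file('config.yml')   # the YAML above
config['device'] = 'cpu'                            # override device when no GPU is available
config['predictor']['beamsearch'] = False           # greedy decoding is faster than beam search

detector = Predictor(config)
print(detector.predict(Image.open('sample_text_line.jpg')))  # hypothetical input image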
pretrain:
id_or_url: 1nTKlEog9YFK74kPyX0qLwCWi60_YHHk4
md5: efcabaa6d3adfca8e52bda2fd7d2ee04
cached: /tmp/tranformerorc.pth
# url or local path
weights: https://drive.google.com/uc?id=1nTKlEog9YFK74kPyX0qLwCWi60_YHHk4
backbone: vgg19_bn
cnn:
# pooling stride size
ss:
- [2, 2]
- [2, 2]
- [2, 1]
- [2, 1]
- [1, 1]
# pooling kernel size
ks:
- [2, 2]
- [2, 2]
- [2, 1]
- [2, 1]
- [1, 1]
# dim of output feature map
hidden: 256
seq_modeling: seq2seq
transformer:
encoder_hidden: 256
decoder_hidden: 256
img_channel: 256
decoder_embedded: 256
dropout: 0.1
optimizer:
max_lr: 0.001
pct_start: 0.1
\ No newline at end of file
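The block above carries only the model-specific overrides for the seq2seq variant (pretrain source, VGG19-BN backbone, CNN pooling sizes, hidden dims, optimizer). In practice these overrides are merged on top of the base config; the sketch below shows one hedged way to do that merge by hand with PyYAML and a recursive dict update, where the file names `base.yml` and `vgg-seq2seq.yml` are assumptions for illustration.

# Hand-rolled merge of a base config with model-specific overrides.
# Assumptions: PyYAML is installed; the files are saved as base.yml and vgg-seq2seq.yml.
import yaml

def deep_update(base, override):
    # Recursively overlay `override` onto `base`, replacing scalars and extending nested dicts.
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

with open('base.yml') as f:
    config = yaml.safe_load(f)
with open('vgg-seq2seq.yml') as f:
    config = deep_update(config, yaml.safe_load(f))

print(config['seq_modeling'], config['optimizer']['max_lr'])  # expected: seq2seq 0.001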
# for training
pretrain: https://vocr.vn/data/vietocr/vgg_transformer.pth
# url or local path (predict)
weights: https://vocr.vn/data/vietocr/vgg_transformer.pth
backbone: vgg19_bn
cnn:
pretrained: True
# pooling stride size
ss:
- [2, 2]
- [2, 2]
- [2, 1]
- [2, 1]
- [1, 1]
# pooling kernel size
ks:
- [2, 2]
- [2, 2]
- [2, 1]
- [2, 1]
- [1, 1]
# dim of output feature map
hidden: 256
seq_modeling: transformer
transformer:
encoder_hidden: 256
decoder_hidden: 256
img_channel: 256
decoder_embedded: 256
dropout: 0.1
optimizer:
max_lr: 0.001
pct_start: 0.1
\ No newline at end of file
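Here `pretrain` and `weights` point at a remote URL, so the checkpoint must be downloaded before it can be loaded. The standard-library sketch below illustrates a simple download-and-cache step; the cache path, the CPU `map_location`, and the decision to skip re-downloading are assumptions, not something this config prescribes.

# Fetch the remote weights once, cache them locally, then load with torch.
# Assumptions: write access to /tmp and a CPU-only load via map_location.
import os
import urllib.request
import torch

WEIGHTS_URL = 'https://vocr.vn/data/vietocr/vgg_transformer.pth'
CACHE_PATH = '/tmp/vgg_transformer.pth'          # assumed cache location

if not os.path.exists(CACHE_PATH):
    urllib.request.urlretrieve(WEIGHTS_URL, CACHE_PATH)

state_dict = torch.load(CACHE_PATH, map_location='cpu')
print(type(state_dict))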
@@ -214,7 +214,7 @@
" im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border\n",
" return im, r, (dw, dh)\n",
"\n",
"names = ['id', 'name', 'birth', 'sex', 'nation', 'origin', 'origin1', 'r1', 'r2']\n",
"names = [\"birth\", \"id\", \"name\", \"nation\", \"origin\", \"origin1\", \"r1\", \"r2\", \"sex\"]\n",
"colors = {name:[random.randint(0, 255) for _ in range(3)] for i,name in enumerate(names)}\n",
"\n",
"img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
@@ -366,9 +366,9 @@
"metadata": {},
"outputs": [],
"source": [
"model_cnn = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/cnn.cpu.torchscript\").to(device)\n",
"model_encoder = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/encoder.cpu.torchscript\").to(device)\n",
"model_decoder = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/decoder.cpu.torchscript\").to(device)"
"model_cnn = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/cnn.torchscript\").to(device)\n",
"model_encoder = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/encoder.torchscript\").to(device)\n",
"model_decoder = torch.jit.load(\"/home/anhcd/Projects/dac_vietocr/ConvertVietOcr2Onnx/weight/decoder.torchscript\").to(device)"
]
},
{
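The hunk above only renames the exported TorchScript files from `*.cpu.torchscript` to the device-agnostic `*.torchscript` names. As a hedged illustration of the same idea, the helper below loads a scripted module and places it on whichever device is available; the relative `weight/` paths and the CUDA fallback are assumptions for the sketch, not part of the notebook.

# Load a TorchScript module and move it to an available device.
# Assumptions: the weight/ directory layout implied by the diff above.
import torch

def load_scripted(path, device=None):
    # torch.jit.load the scripted module, auto-detecting the device when none is given.
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    module = torch.jit.load(path, map_location=device)
    return module.to(device).eval()

model_cnn = load_scripted('weight/cnn.torchscript')
model_encoder = load_scripted('weight/encoder.torchscript')
model_decoder = load_scripted('weight/decoder.torchscript')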
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Errno 2] No such file or directory: 'yolov7'\n",
"/home/anhcd/Projects/dac_vietocr/yolov7\n"
]
}
],
"source": [
"%cd yolov7\n",
"import cv2\n",
"import torch\n",
"import numpy as np\n",
"import os\n",
"\n",
"from models.experimental import attempt_load\n",
"from utils.datasets import letterbox\n",
"from utils.general import non_max_suppression, scale_coords\n",
"from utils.plots import plot_one_box\n",
"\n",
"device = torch.device(\"cpu\")\n",
"half = device.type != \"cpu\"\n",
"\n",
"CATEGORIES = [\"bottom_left\",\"bottom_right\", \"top_left\", \"top_right\"]\n",
"COLORS = [(66, 135, 245), (194, 66, 245), (250, 52, 72), (111, 250, 52)]\n",
"\n",
"\n",
"def load_model(path):\n",
" model = attempt_load(path, map_location=device)\n",
"\n",
" if half:\n",
" model.half() \n",
" #model = torch.load(path, map_location=torch.device('cpu'))\n",
"\n",
" return model\n",
"\n",
"\n",
"def convert_img(img, device, half, new_size=416):\n",
" img = letterbox(img, new_shape=new_size)[0]\n",
" img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416\n",
" img = np.ascontiguousarray(img)\n",
" img = torch.from_numpy(img).to(device)\n",
"\n",
" img = img.half() if half else img.float()\n",
" img = img / 255.0\n",
"\n",
" if img.ndimension() == 3:\n",
" img = img.unsqueeze(0)\n",
"\n",
" return img\n",
"\n",
"\n",
"def predict_4_corners(img, model, resized_width=640):\n",
" if resized_width is not None:\n",
" orig_img = cv2.resize(img, width=640)\n",
" else:\n",
" orig_img = cv2.imread(img).copy()\n",
"\n",
" plot_img = orig_img.copy()\n",
" img = convert_img(orig_img, device, half, new_size=640)\n",
" _, _, new_height, new_width = img.size()\n",
"\n",
" preds = model(img)[0]\n",
" preds = non_max_suppression(preds, 0.2, 0.2)\n",
" result = np.array([], dtype=np.float32)\n",
"\n",
" # based on YOLOv7\n",
" for i, det in enumerate(preds): # detections per image\n",
" if det is not None:\n",
" # Rescale boxes from img_size to im0 sizes\n",
" det[:, :4] = scale_coords(img.size()[2:], det[:, :4], orig_img.shape).round()\n",
" result = det.type(torch.float32).cpu().detach().numpy()\n",
"\n",
" # for visualization only\n",
" for *xyxy, conf, cls in reversed(det):\n",
" label = \"%s %.2f\" % (CATEGORIES[int(cls)], conf)\n",
" plot_one_box(xyxy, plot_img, label=label, color=COLORS[int(cls)], line_thickness=2)\n",
"\n",
" return result, orig_img, plot_img\n",
"\n",
"\n",
"def filter_redundancy(result):\n",
" deleted_indexes = []\n",
"\n",
" for i, element in enumerate(result):\n",
" category = element[5]\n",
" max_class_confidence_score = max(result[np.where(result[:, 5] == category)][:, 4])\n",
"\n",
" if element[4] < max_class_confidence_score:\n",
" deleted_indexes.append(i)\n",
"\n",
" return np.delete(result, deleted_indexes, axis=0)\n",
"\n",
"\n",
"def get_center_point(coordinate_dict):\n",
" di = dict()\n",
"\n",
" for key in coordinate_dict.keys():\n",
" xmin, ymin, xmax, ymax = coordinate_dict[key]\n",
" x_center = (xmin + xmax) / 2\n",
" y_center = (ymin + ymax) / 2\n",
" di[key] = (x_center, y_center)\n",
"\n",
" return di\n",
"\n",
"def get_point(coordinate_dict):\n",
" di = dict()\n",
"\n",
" for key in coordinate_dict.keys():\n",
" xmin, ymin, xmax, ymax = coordinate_dict[key]\n",
" if key == 'bottom_left':\n",
" di[key] = (xmin, ymin + 50)\n",
" elif key == 'bottom_right':\n",
" di[key] = (xmax, ymin + 50)\n",
" elif key == 'top_left':\n",
" di[key] = (xmin, ymax - 25)\n",
" elif key == 'top_right':\n",
" di[key] = (xmax, ymax - 25)\n",
"\n",
" return di\n",
"\n",
"\n",
"def dictionary(result):\n",
" diction = {}\n",
" for det in result:\n",
" if int(det[5]) == 0:\n",
" a = 'bottom_left'\n",
" elif int(det[5]) == 1:\n",
" a = 'bottom_right'\n",
" elif int(det[5]) == 2:\n",
" a = 'top_left'\n",
" elif int(det[5]) == 3:\n",
" a = 'top_right' \n",
" label = [\n",
" det[0], \n",
" det[1], \n",
" det[2], \n",
" det[3], \n",
" ]\n",
" diction[a] = label\n",
"\n",
" return diction\n",
"\n",
"def find_miss_corner(coordinate_dict):\n",
" position_name = ['bottom_left', 'bottom_right','top_left', 'top_right']\n",
" position_index = np.array([0, 0, 0, 0])\n",
"\n",
" for name in coordinate_dict.keys():\n",
" if name in position_name:\n",
" position_index[position_name.index(name)] = 1\n",
"\n",
" index = np.argmin(position_index)\n",
"\n",
" return index\n",
"\n",
"\n",
"def calculate_missed_coord_corner(coordinate_dict):\n",
" thresh = 0\n",
"\n",
" index = find_miss_corner(coordinate_dict)\n",
"\n",
" # calculate missed corner coordinate\n",
" # case 1: missed corner is \"top_left\"\n",
" \n",
" if index == 0: # \"bottom_left\"\n",
" midpoint = np.add(coordinate_dict['top_left'], coordinate_dict['bottom_right']) / 2\n",
" y = 2 * midpoint[1] - coordinate_dict['top_right'][1] - thresh\n",
" x = 2 * midpoint[0] - coordinate_dict['top_right'][0] - thresh\n",
" coordinate_dict['bottom_left'] = (x, y)\n",
" elif index == 1: # \"bottom_right\"\n",
" midpoint = np.add(coordinate_dict['bottom_left'], coordinate_dict['top_right']) / 2\n",
" y = 2 * midpoint[1] - coordinate_dict['top_left'][1] - thresh\n",
" x = 2 * midpoint[0] - coordinate_dict['top_left'][0] - thresh\n",
" coordinate_dict['bottom_right'] = (x, y)\n",
" elif index == 2:\n",
" midpoint = np.add(coordinate_dict['top_right'], coordinate_dict['bottom_left']) / 2\n",
" y = 2 * midpoint[1] - coordinate_dict['bottom_right'][1] - thresh\n",
" x = 2 * midpoint[0] - coordinate_dict['bottom_right'][0] - thresh\n",
" coordinate_dict['top_left'] = (x, y)\n",
" elif index == 3: # \"top_right\"\n",
" midpoint = np.add(coordinate_dict['top_left'], coordinate_dict['bottom_right']) / 2\n",
" y = 2 * midpoint[1] - coordinate_dict['bottom_left'][1] - thresh\n",
" x = 2 * midpoint[0] - coordinate_dict['bottom_left'][0] - thresh\n",
" coordinate_dict['top_right'] = (x, y)\n",
" return coordinate_dict\n",
"\n",
"\n",
"def perspective_transform(image, source_points):\n",
" dest_points = np.float32([[0, 0], [500, 0], [500, 300], [0, 300]])\n",
" M = cv2.getPerspectiveTransform(source_points, dest_points)\n",
" dst = cv2.warpPerspective(image, M, (500, 300))\n",
"\n",
" return dst\n",
"\n",
"\n",
"\n",
"def warp_identity_card(img, model):\n",
" result, orig_img, plot_img = predict_4_corners(img, model, None)\n",
" if len(result) > 4:\n",
" result = filter_redundancy(result)\n",
"\n",
" result = dictionary(result) \n",
" result = get_point(result) \n",
"\n",
" if len(result) < 3:\n",
" print('Please try again')\n",
" return plot_img\n",
" \n",
" if len(result) == 3:\n",
" result = calculate_missed_coord_corner(result) \n",
"\n",
" bottom_right_point = result['bottom_right']\n",
" bottom_left_point = result['bottom_left']\n",
" top_left_point = result['top_left']\n",
" top_right_point = result['top_right']\n",
"\n",
" source_points = np.float32([top_left_point,top_right_point,bottom_right_point,bottom_left_point])\n",
" #img = cv2.imread(orig_img)\n",
" crop = perspective_transform(orig_img, source_points)\n",
"\n",
" return crop\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Xử lý xoay ảnh "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fusing layers... \n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"IDetect.fuse\n",
"Please try again\n",
"Please try again\n",
"Please try again\n",
"Please try again\n",
"Please try again\n",
"Please try again\n",
"Please try again\n"
]
}
],
"source": [
"import glob\n",
"import os\n",
"import cv2\n",
"import numpy as np\n",
"\n",
"import os\n",
"img_paths = []\n",
"dir = \"/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao\"\n",
"for file in os.listdir(dir):\n",
" img_paths.append(os.path.join(dir, file))\n",
"\n",
"model = load_model(\"/home/anhcd/Projects/dac_vietocr/model/cccd_4_conner.pt\")\n",
"\n",
"for img_path in img_paths:\n",
" img_name = img_path.split(\"/\")[-1].split(\".\")[0]\n",
" \n",
" crop = warp_identity_card(img_path, model)\n",
" \n",
" cv2.imshow(f\"{img_name}\", crop) \n",
" cv2.waitKey(1000)\n",
" # cv2.imwrite(\"/home/anhcd/Projects/IEcccd/img/\" + img_name+ \".jpg\", crop)\n",
"\n",
"cv2.destroyAllWindows()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Xử lý các thông tin bên trong"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"categories_text = ['id', 'name', 'birth', 'sex', 'nation', 'origin', 'origin1', 'r1', 'r2']\n",
"\n",
"def predict_text(img, model, resized_width=640):\n",
" if resized_width is not None:\n",
" orig_img = cv2.resize(img, width=640)\n",
" else:\n",
" orig_img = cv2.imread(img).copy()\n",
"\n",
" plot_img = orig_img.copy()\n",
" img = convert_img(orig_img, device, half, new_size=640)\n",
" _, _, new_height, new_width = img.size()\n",
"\n",
" preds = model(img)[0]\n",
" print(preds)\n",
" preds = non_max_suppression(preds, 0.2, 0.2)\n",
" result = np.array([], dtype=np.float32)\n",
"\n",
" # based on YOLOv7\n",
" for i, det in enumerate(preds): # detections per image\n",
" if det is not None:\n",
" # Rescale boxes from img_size to im0 sizes\n",
" det[:, :4] = scale_coords(img.size()[2:], det[:, :4], orig_img.shape).round()\n",
" result = det.type(torch.float32).cpu().detach().numpy()\n",
"\n",
" # for visualization only\n",
" #for *xyxy, conf, cls in reversed(det):\n",
" # label = '%s %.2f' % (CATEGORIES[int(cls)], conf)\n",
" #plot_one_box(xyxy, plot_img, label=label, color=COLORS[int(cls)], line_thickness=2) '''\n",
"\n",
" return result, orig_img, plot_img \n",
"\n",
"def dictionary_text(result):\n",
" diction = {}\n",
" for det in result:\n",
" if int(det[5]) == 0:\n",
" a = 'birth'\n",
" elif int(det[5]) == 1:\n",
" a = 'id'\n",
" elif int(det[5]) == 2:\n",
" a = 'name'\n",
" elif int(det[5]) == 3:\n",
" a = 'nation' \n",
" elif int(det[5]) == 4:\n",
" a = 'origin' \n",
" elif int(det[5]) == 5:\n",
" a = 'origin1' \n",
" elif int(det[5]) == 6:\n",
" a = 'r1' \n",
" elif int(det[5]) == 7:\n",
" a = 'r2' \n",
" elif int(det[5]) == 8:\n",
" a = 'sex' \n",
" label = [\n",
" det[0], \n",
" det[1],\n",
" det[2], \n",
" det[3] \n",
" ]\n",
" diction[a] = label\n",
"\n",
" return diction\n",
"\n",
"def perspective_transform_text(image, source_points):\n",
" dest_points = np.float32([[0, 0], [800, 0], [800, 100], [0, 100]])\n",
" M = cv2.getPerspectiveTransform(source_points, dest_points)\n",
" dst = cv2.warpPerspective(image, M, (800, 100))\n",
"\n",
" return dst"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fusing layers... \n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"IDetect.fuse\n",
"img_paths: ['/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1nation.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1birth.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1name.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1sex.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1origin.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1r2.jpg', '/home/anhcd/Projects/dac_vietocr/img_ocr/ducanhcao/ducanhcaocccd1id.jpg']\n",
"img_name: ducanhcaocccd1nation\n",
"tensor([[[7.26099e+00, 4.05999e+00, 2.87396e+01, ..., 1.99240e-02, 2.64919e-02, 2.19129e-01],\n",
" [1.37559e+01, 5.06019e+00, 3.11044e+01, ..., 3.57850e-02, 2.77619e-02, 1.63850e-01],\n",
" [2.29905e+01, 4.46362e+00, 4.87838e+01, ..., 5.33510e-02, 2.07194e-02, 7.84407e-02],\n",
" ...,\n",
" [5.50323e+02, 7.48115e+01, 1.05305e+02, ..., 3.95563e-02, 4.42920e-02, 1.24317e-01],\n",
" [5.80126e+02, 7.41523e+01, 1.00826e+02, ..., 4.02013e-02, 5.62249e-02, 9.85802e-02],\n",
" [6.13404e+02, 7.78276e+01, 1.52621e+02, ..., 4.46064e-02, 4.85629e-02, 1.02114e-01]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.09395909309387207 seconds\n",
"img_name: ducanhcaocccd1birth\n",
"tensor([[[6.96512e+00, 4.01770e+00, 3.03306e+01, ..., 2.01924e-02, 2.56553e-02, 1.28210e-01],\n",
" [1.41638e+01, 5.12023e+00, 3.38066e+01, ..., 4.23885e-02, 3.26655e-02, 9.99929e-02],\n",
" [2.26646e+01, 4.49237e+00, 4.83757e+01, ..., 7.18630e-02, 2.23456e-02, 8.54115e-02],\n",
" ...,\n",
" [5.46470e+02, 7.53209e+01, 1.09950e+02, ..., 3.55830e-02, 3.50057e-02, 1.43581e-01],\n",
" [5.77843e+02, 7.36600e+01, 1.26061e+02, ..., 3.77752e-02, 4.51986e-02, 1.06225e-01],\n",
" [6.11994e+02, 7.75922e+01, 1.71467e+02, ..., 3.88749e-02, 4.10803e-02, 1.09043e-01]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.07357263565063477 seconds\n",
"img_name: ducanhcaocccd1name\n",
"tensor([[[7.85997e+00, 4.04146e+00, 3.11653e+01, ..., 2.21473e-02, 2.85321e-02, 1.30392e-01],\n",
" [1.46489e+01, 5.06272e+00, 3.40587e+01, ..., 5.33185e-02, 3.22187e-02, 1.10082e-01],\n",
" [2.28639e+01, 4.56742e+00, 4.99890e+01, ..., 9.17780e-02, 2.37300e-02, 7.56081e-02],\n",
" ...,\n",
" [5.50199e+02, 7.26495e+01, 1.09896e+02, ..., 3.20059e-02, 5.48642e-02, 1.52614e-01],\n",
" [5.79119e+02, 7.11925e+01, 1.06547e+02, ..., 3.43380e-02, 6.31083e-02, 1.27570e-01],\n",
" [6.13034e+02, 7.54183e+01, 1.65852e+02, ..., 4.05585e-02, 5.14021e-02, 1.19271e-01]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.0675969123840332 seconds\n",
"img_name: ducanhcaocccd1sex\n",
"tensor([[[6.75139e+00, 3.79737e+00, 3.04912e+01, ..., 1.90046e-02, 2.42044e-02, 1.88708e-01],\n",
" [1.37134e+01, 5.20134e+00, 3.29218e+01, ..., 3.09879e-02, 2.90582e-02, 1.28494e-01],\n",
" [2.22934e+01, 4.87867e+00, 4.81174e+01, ..., 4.31133e-02, 2.05402e-02, 7.62491e-02],\n",
" ...,\n",
" [5.51570e+02, 7.44274e+01, 9.48155e+01, ..., 3.55333e-02, 4.69442e-02, 1.29273e-01],\n",
" [5.80613e+02, 7.39035e+01, 8.86110e+01, ..., 4.03416e-02, 5.58332e-02, 9.57269e-02],\n",
" [6.13693e+02, 7.79142e+01, 1.48807e+02, ..., 4.39701e-02, 4.92099e-02, 9.85647e-02]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.06969332695007324 seconds\n",
"img_name: ducanhcaocccd1origin\n",
"tensor([[[6.30326e+00, 3.94146e+00, 3.29426e+01, ..., 3.67895e-02, 3.45050e-02, 1.36935e-01],\n",
" [1.30867e+01, 5.66837e+00, 3.39089e+01, ..., 6.47475e-02, 4.40949e-02, 1.11277e-01],\n",
" [2.04908e+01, 5.58759e+00, 4.72886e+01, ..., 8.97226e-02, 2.74214e-02, 7.83768e-02],\n",
" ...,\n",
" [5.54586e+02, 7.71124e+01, 1.47577e+02, ..., 4.88534e-02, 4.75350e-02, 1.28162e-01],\n",
" [5.82739e+02, 7.57000e+01, 1.32784e+02, ..., 4.18695e-02, 6.51118e-02, 1.01845e-01],\n",
" [6.13369e+02, 7.69265e+01, 1.95680e+02, ..., 4.00506e-02, 6.35686e-02, 9.50838e-02]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.07462000846862793 seconds\n",
"img_name: ducanhcaocccd1r2\n",
"tensor([[[6.54024e+00, 3.99407e+00, 3.07478e+01, ..., 2.84999e-02, 3.44758e-02, 1.18740e-01],\n",
" [1.38926e+01, 4.84244e+00, 3.47917e+01, ..., 6.59324e-02, 3.85384e-02, 1.24240e-01],\n",
" [2.16096e+01, 4.78552e+00, 4.92324e+01, ..., 1.01309e-01, 2.57947e-02, 1.20259e-01],\n",
" ...,\n",
" [5.53538e+02, 7.74819e+01, 1.41711e+02, ..., 4.88862e-02, 3.97473e-02, 1.29176e-01],\n",
" [5.83266e+02, 7.85188e+01, 1.41294e+02, ..., 4.43791e-02, 5.97726e-02, 9.62701e-02],\n",
" [6.13904e+02, 8.01113e+01, 1.99567e+02, ..., 4.24263e-02, 6.37722e-02, 9.40810e-02]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.0670614242553711 seconds\n",
"img_name: ducanhcaocccd1id\n",
"tensor([[[7.15865e+00, 3.90274e+00, 3.01145e+01, ..., 1.87799e-02, 2.34303e-02, 1.72056e-01],\n",
" [1.43305e+01, 4.87431e+00, 3.33370e+01, ..., 3.62554e-02, 2.64340e-02, 1.28734e-01],\n",
" [2.30727e+01, 4.47840e+00, 4.85699e+01, ..., 5.64415e-02, 1.59234e-02, 9.82331e-02],\n",
" ...,\n",
" [5.54129e+02, 7.76490e+01, 9.91483e+01, ..., 4.22094e-02, 4.26301e-02, 1.37826e-01],\n",
" [5.81268e+02, 7.57913e+01, 9.21433e+01, ..., 3.96665e-02, 5.87279e-02, 1.06169e-01],\n",
" [6.14154e+02, 8.00202e+01, 1.56239e+02, ..., 4.18954e-02, 6.09098e-02, 1.01468e-01]]], grad_fn=<CatBackward0>)\n",
"result yolo7 []\n",
"Elapsed time: 0.07022809982299805 seconds\n"
]
}
],
"source": [
"import time \n",
"\n",
"model_text = load_model(\"/home/anhcd/Projects/dac_vietocr/model/crop_text.pt\")\n",
"dir_ocr = '/home/anhcd/Projects/dac_vietocr/img_ocr'\n",
"print(f'img_paths: {img_paths}')\n",
"for img_path in img_paths:\n",
" img_name = img_path.split(\"/\")[-1].split(\".\")[0]\n",
" print(f'img_name: {img_name}')\n",
"\n",
" start_time = time.time()\n",
" result, orig_img, plot_img = predict_text(img_path, model_text, None)\n",
" end_time = time.time()\n",
" # Print the elapsed time\n",
" print(\"Elapsed time:\", end_time - start_time, \"seconds\")\n",
"\n",
" # print(f'result: {result}')\n",
" result = dictionary_text(result)\n",
"\n",
" for key in result.keys():\n",
" xmin, ymin, xmax, ymax = result[key]\n",
"\n",
" bottom_right_point = (xmax, ymin)\n",
" bottom_left_point = (xmin, ymin)\n",
" top_left_point = (xmin, ymax)\n",
" top_right_point = (xmax, ymax)\n",
"\n",
" source_points = np.float32([bottom_left_point,bottom_right_point, top_right_point,top_left_point]) \n",
" crop = perspective_transform_text(orig_img, source_points)\n",
" cv2.imwrite(f\"{dir_ocr}/\" + img_name + key + \".jpg\", crop)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Main flow"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fusing layers... \n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"IDetect.fuse\n",
"Fusing layers... \n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"RepConv.fuse_repvgg_block\n",
"IDetect.fuse\n",
"Please try again\n",
"result yolo7 []\n",
"result yolo7 []\n",
"result yolo7 []\n",
"result yolo7 []\n",
"result yolo7 []\n",
"result yolo7 []\n"
]
}
],
"source": [
"import glob\n",
"import os\n",
"import cv2\n",
"import numpy as np\n",
"import os\n",
"\n",
"if __name__ == \"__main__\":\n",
" model = load_model(\"/home/anhcd/Projects/dac_ocr/model/cccd_4_conner.pt\")\n",
" model_text = load_model(\"/home/anhcd/Projects/dac_ocr/model/crop_text.pt\")\n",
"\n",
" img_paths = []\n",
" dir = \"/home/anhcd/Projects/dac_ocr/img/\"\n",
" dir_crop = \"/home/anhcd/Projects/dac_ocr/img_crop/\"\n",
" dir_ocr = \"/home/anhcd/Projects/dac_ocr/img_ocr/\"\n",
"\n",
" for file in os.listdir(dir):\n",
" img_paths.append(os.path.join(dir, file))\n",
"\n",
" for img_path in img_paths:\n",
" img_name = img_path.split(\"/\")[-1].split(\".\")[0]\n",
" \n",
" crop = warp_identity_card(img_path, model)\n",
" cv2.imwrite(dir_crop + img_name + \".jpg\", crop)\n",
"\n",
" imgcrop_paths = []\n",
" \n",
" for file in os.listdir(dir_crop):\n",
" imgcrop_paths.append(os.path.join(dir_crop, file))\n",
"\n",
" for img_path in imgcrop_paths:\n",
" img_name = img_path.split(\"/\")[-1].split(\".\")[0]\n",
" result, orig_img, plot_img = predict_text(img_path, model_text, None)\n",
" result = dictionary_text(result)\n",
" if result == {}:\n",
" continue\n",
"\n",
" for key in result.keys():\n",
" xmin, ymin, xmax, ymax = result[key]\n",
"\n",
" bottom_right_point = (xmax, ymin)\n",
" bottom_left_point = (xmin, ymin)\n",
" top_left_point = (xmin, ymax)\n",
" top_right_point = (xmax, ymax)\n",
"\n",
" source_points = np.float32([bottom_left_point,bottom_right_point, top_right_point,top_left_point]) \n",
" crop = perspective_transform_text(orig_img, source_points)\n",
" cv2.imwrite(f\"{dir_ocr}/\" + img_name + key + \".jpg\", crop)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "dacocr",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}