commit 9c6cac5223
34 changed files with 7202 additions and 0 deletions
@@ -0,0 +1,135 @@

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
**/*.pyc

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/en/_build
docs/zh_cn/_build

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# PyTorch
*.pth

*.DS_Store

# custom
/checkpoint
/models
/dataset
.vscode
.idea
*.pkl
*.pkl.json
*.log.json
*.npy
work_dirs/
docs/**/topics/
docs/**/papers/*.md
docs/**/datasets.md
docs/**/modelzoo.md

!tests/data/**/*.pkl
!tests/data/**/*.pkl.json
!tests/data/**/*.log.json
!tests/data/**/*.pth
!tests/data/**/*.npy
temp/
@@ -0,0 +1,115 @@
# STCFormer: 3D Human Pose Estimation with Spatio-Temporal Criss-cross Attention [CVPR 2023]

This is the readme file for the code release of "3D Human Pose Estimation with Spatio-Temporal Criss-cross Attention" on the PyTorch platform.

Thank you for your interest; the code and checkpoints are being updated.
> [**3D Human Pose Estimation with Spatio-Temporal Criss-cross Attention**](https://openaccess.thecvf.com/content/CVPR2023/papers/Tang_3D_Human_Pose_Estimation_With_Spatio-Temporal_Criss-Cross_Attention_CVPR_2023_paper.pdf),
> Zhenhua Tang, Zhaofan Qiu, Yanbin Hao, Richang Hong, and Ting Yao,
> *In IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2023*

## Poster:
<p align="center"><img src="poster_9M.png" width="100%" alt="" /></p>

## Demo:


## The released code includes:
* `checkpoint/`: the folder for model weights of STCFormer.
* `dataset/`: the folder for data loaders.
* `common/`: the folder for basic functions.
* `model/`: the folder for the STCFormer network.
* `run_stc.py`: the Python script for training the STCFormer network.

## Dependencies
Make sure you have the following dependencies installed:
* PyTorch >= 0.4.0
* NumPy
* Matplotlib == 3.1.0

## Dataset

Our model is evaluated on the [Human3.6M](http://vision.imar.ro/human3.6m) and [MPI-INF-3DHP](https://vcai.mpi-inf.mpg.de/3dhp-dataset/) datasets.

### Human3.6M
We set up the Human3.6M dataset in the same way as [VideoPose3D](https://github.com/facebookresearch/VideoPose3D/blob/master/DATASETS.md).
### MPI-INF-3DHP
We set up the MPI-INF-3DHP dataset in the same way as [P-STMO](https://github.com/paTRICK-swk/P-STMO).

## Training from scratch
### Human 3.6M
For the training stage, please run:
```bash
python run_stc.py -f 27 -b 128 --train 1 --layers 6 -s 3
```
For the testing stage, please run:
```bash
python run_stc.py -f 27 -b 128 --train 0 --layers 6 -s 1 --reload 1 --previous_dir ./checkpoint/your_best_model.pth
```

## Evaluating our models

You can download our pre-trained models from [Google Drive](https://drive.google.com/drive/folders/1waaQ1Yj-HfbNahnCN8AWCjMCGzyhZJF7?usp=sharing) or [Baidu Disk](https://pan.baidu.com/s/1axVQNHxdZFH4Eiqiy2EvYQ) (extraction code: STC1). Put them in the ./checkpoint directory.

### Human 3.6M

To evaluate our STCFormer model on the 2D keypoints obtained by CPN, please run:
```bash
python run_stc.py -f 27 -b 128 --train 0 --layers 6 -s 1 -k 'cpn_ft_h36m_dbb' --reload 1 --previous_dir ./checkpoint/model_27_STCFormer/no_refine_6_4406.pth
```
```bash
python run_stc.py -f 81 -b 128 --train 0 --layers 6 -s 1 -k 'cpn_ft_h36m_dbb' --reload 1 --previous_dir ./checkpoint/model_81_STCFormer/no_refine_6_4172.pth
```
The two commands use the configurations of the 27-frame and 81-frame models, respectively; their results are as follows.

| Frames | P1 (mm) | P2 (mm) |
| ------------- | ------------- | ------------- |
| 27 | 44.08 | 34.76 |
| 81 | 41.72 | 32.94 |

The model with 243-frame input is proprietary and stored exclusively on the company server, so it is unavailable due to copyright restrictions. If you need results for that setting, we recommend training a comparable model yourself.

### MPI-INF-3DHP
The pre-trained models and code for STCFormer are currently being updated. In the meantime, you can run the following command, based on an earlier and less organized version of the code, to reproduce the 81-frame results.

```bash
python run_3dhp_stc.py --train 0 --frames 81 -b 128 -s 1 --reload 1 --previous_dir ./checkpoint/model_81_STMO/no_refine_8_2310.pth
```

### In the Wild Video
Following MHFormer, first download the YOLOv3 and HRNet pretrained models [here](https://drive.google.com/drive/folders/1_ENAMOsPM7FXmdYRbkwbFHgzQq_B_NQA) and put them in the './demo/lib/checkpoint' directory. Then put your in-the-wild videos in the './demo/video' directory.

You can modify the 'get_pose3D' function in the 'vis.py' script according to your needs, including the checkpoint and model parameters, and then execute the following command:

```bash
python demo/vis.py --video sample_video.mp4
```

## Citation

If you find this repo useful, please consider citing our paper:

```
@inproceedings{tang20233d,
  title={3D Human Pose Estimation With Spatio-Temporal Criss-Cross Attention},
  author={Tang, Zhenhua and Qiu, Zhaofan and Hao, Yanbin and Hong, Richang and Yao, Ting},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages={4790--4799},
  year={2023}
}
```

## Acknowledgement
Our code refers to the following repositories.

[VideoPose3D](https://github.com/facebookresearch/VideoPose3D) \
[StridedTransformer-Pose3D](https://github.com/Vegetebird/StridedTransformer-Pose3D) \
[P-STMO](https://github.com/paTRICK-swk/P-STMO/tree/main) \
[MHFormer](https://github.com/Vegetebird/MHFormer) \
[MixSTE](https://github.com/JinluZhang1126/MixSTE) \
[FTCM](https://github.com/zhenhuat/FTCM)

We thank the authors for releasing their codes.
@@ -0,0 +1,92 @@
import sys
import numpy as np
import torch


def normalize_screen_coordinates(X, w, h):
    assert X.shape[-1] == 2
    return X / w * 2 - [1, h / w]


def image_coordinates(X, w, h):
    assert X.shape[-1] == 2

    # Reverse camera frame normalization
    return (X + [1, h / w]) * w / 2
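
# Usage sketch (values assume a 1000x1002 Human3.6M frame):
#   pts  = np.array([[500.0, 501.0]])
#   norm = normalize_screen_coordinates(pts, w=1000, h=1002)  # -> [[0.0, 0.0]]
#   back = image_coordinates(norm, w=1000, h=1002)            # recovers pts
# x is mapped to [-1, 1]; y is scaled by the same 2/w factor to preserve the
# aspect ratio, so it spans [-h/w, h/w].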


def world_to_camera(X, R, t):
    Rt = wrap(qinverse, R)
    return wrap(qrot, np.tile(Rt, (*X.shape[:-1], 1)), X - t)


def camera_to_world(X, R, t):
    return wrap(qrot, np.tile(R, (*X.shape[:-1], 1)), X) + t
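
# Round-trip sketch: world_to_camera rotates (X - t) by the inverse camera
# quaternion, and camera_to_world undoes it, so (up to float error)
#   camera_to_world(world_to_camera(X, R, t), R, t) == X,
# with R a unit quaternion in (w, x, y, z) order and t the camera position
# expressed in world coordinates.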


def wrap(func, *args, unsqueeze=False):
    # Call a torch function with arguments that may be NumPy arrays,
    # converting inputs and outputs back and forth as needed.
    args = list(args)
    for i, arg in enumerate(args):
        if type(arg) == np.ndarray:
            args[i] = torch.from_numpy(arg)
            if unsqueeze:
                args[i] = args[i].unsqueeze(0)

    result = func(*args)

    if isinstance(result, tuple):
        result = list(result)
        for i, res in enumerate(result):
            if type(res) == torch.Tensor:
                if unsqueeze:
                    res = res.squeeze(0)
                result[i] = res.numpy()
        return tuple(result)
    elif type(result) == torch.Tensor:
        if unsqueeze:
            result = result.squeeze(0)
        return result.numpy()
    else:
        return result


def qrot(q, v):
    # Rotate vector(s) v by quaternion(s) q, given in (w, x, y, z) order.
    assert q.shape[-1] == 4
    assert v.shape[-1] == 3
    assert q.shape[:-1] == v.shape[:-1]

    qvec = q[..., 1:]
    uv = torch.cross(qvec, v, dim=len(q.shape) - 1)
    uuv = torch.cross(qvec, uv, dim=len(q.shape) - 1)
    return (v + 2 * (q[..., :1] * uv + uuv))
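
# Worked example: q = [cos 45°, 0, 0, sin 45°] encodes a 90° rotation about
# the z axis. For v = [1, 0, 0]:
#   uv  = qvec x v  = [0, 0.7071, 0]
#   uuv = qvec x uv = [-0.5, 0, 0]
#   v + 2 * (w * uv + uuv) = [0, 1, 0], i.e. x is rotated onto y.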


def qinverse(q, inplace=False):
    if inplace:
        q[..., 1:] *= -1
        return q
    else:
        w = q[..., :1]
        xyz = q[..., 1:]
        return torch.cat((w, -xyz), dim=len(q.shape) - 1)


def get_uvd2xyz(uvd, gt_3D, cam):
    # Lift (u, v, depth) predictions to root-relative 3D camera coordinates,
    # using the ground-truth root depth and the camera intrinsics.
    N, T, V, _ = uvd.size()

    dec_out_all = uvd.view(-1, T, V, 3).clone()
    root = gt_3D[:, :, 0, :].unsqueeze(-2).repeat(1, 1, V, 1).clone()
    enc_in_all = uvd[:, :, :, :2].view(-1, T, V, 2).clone()

    cam_f_all = cam[..., :2].view(-1, 1, 1, 2).repeat(1, T, V, 1)
    cam_c_all = cam[..., 2:4].view(-1, 1, 1, 2).repeat(1, T, V, 1)

    # Absolute depth: the root takes its ground-truth depth, the other
    # joints add their predicted depth offsets to the root depth.
    z_global = dec_out_all[:, :, :, 2]
    z_global[:, :, 0] = root[:, :, 0, 2]
    z_global[:, :, 1:] = dec_out_all[:, :, 1:, 2] + root[:, :, 1:, 2]
    z_global = z_global.unsqueeze(-1)

    uv = enc_in_all - cam_c_all
    xy = uv * z_global.repeat(1, 1, 1, 2) / cam_f_all
    xyz_global = torch.cat((xy, z_global), -1)
    xyz_offset = (xyz_global - xyz_global[:, :, 0, :].unsqueeze(-2).repeat(1, 1, V, 1))

    return xyz_offset
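
# The back-projection above inverts the pinhole model u = f_x * x / z + c_x
# (in the normalized camera coordinates used by this repo): given a pixel
# (u, v), intrinsics (f, c) and a depth z,
#   x = (u - c_x) * z / f_x,  y = (v - c_y) * z / f_y,
# after which the pose is re-expressed relative to the root joint.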
@@ -0,0 +1,66 @@
import os
import numpy as np
from common.utils_3dhp import *

import scipy.io as scio

# Path to the MPI-INF-3DHP training data; adjust to your setup.
# Note: the path handling below assumes Windows-style separators.
data_path = r'F:\mpi_inf_3dhp\data'
cam_set = [0, 1, 2, 4, 5, 6, 7, 8]
# joint_set = [8, 6, 15, 16, 17, 10, 11, 12, 24, 25, 26, 19, 20, 21, 5, 4, 7]
joint_set = [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 4, 3, 6]

dic_seq = {}

for root, dirs, files in os.walk(data_path):

    for file in files:
        if file.endswith("mat"):

            path = root.split("\\")
            subject = path[-2][1]  # e.g. "S1" -> "1"
            seq = path[-1][3]      # e.g. "Seq1" -> "1"
            print("loading %s %s..." % (path[-2], path[-1]))

            temp = mpii_get_sequence_info(subject, seq)

            frames = temp[0]
            fps = temp[1]

            data = scio.loadmat(os.path.join(root, file))
            cameras = data['cameras'][0]
            for cam_idx in range(len(cameras)):
                assert cameras[cam_idx] == cam_idx

            data_2d = data['annot2'][cam_set]
            data_3d = data['univ_annot3'][cam_set]

            dic_cam = {}
            for cam_idx in range(len(data_2d)):
                data_2d_cam = data_2d[cam_idx][0]
                data_3d_cam = data_3d[cam_idx][0]

                data_2d_cam = data_2d_cam.reshape(data_2d_cam.shape[0], 28, 2)
                data_3d_cam = data_3d_cam.reshape(data_3d_cam.shape[0], 28, 3)

                # Keep only the 17-joint subset used by this repo.
                data_2d_select = data_2d_cam[:frames, joint_set]
                data_3d_select = data_3d_cam[:frames, joint_set]

                dic_data = {"data_2d": data_2d_select, "data_3d": data_3d_select}

                dic_cam.update({str(cam_set[cam_idx]): dic_data})

            dic_seq.update({path[-2] + " " + path[-1]: [dic_cam, fps]})

np.savez_compressed('data_train_3dhp', data=dic_seq)
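
# Reload sketch for the archive written above (keys look like "S1 Seq1"):
#   data = np.load('data_train_3dhp.npz', allow_pickle=True)['data'].item()
#   cams, fps = data['S1 Seq1']
#   poses_2d = cams['0']['data_2d']   # (frames, 17, 2) for camera 0
#   poses_3d = cams['0']['data_3d']   # (frames, 17, 3)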
@@ -0,0 +1,51 @@
import os
import numpy as np
from common.utils_3dhp import *

import h5py

import scipy.io as scio

# Path to the MPI-INF-3DHP test set; adjust to your setup.
# Note: the path handling below assumes Windows-style separators.
data_path = r'F:\mpi_inf_3dhp\mpi_inf_3dhp_test_set'
cam_set = [0, 1, 2, 4, 5, 6, 7, 8]
# joint_set = [8, 6, 15, 16, 17, 10, 11, 12, 24, 25, 26, 19, 20, 21, 5, 4, 7]
joint_set = [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 4, 3, 6]

dic_seq = {}

for root, dirs, files in os.walk(data_path):

    for file in files:
        if file.endswith("mat"):

            path = root.split("\\")
            subject = path[-1][2]  # e.g. "TS1" -> "1"
            print("loading %s..." % path[-1])

            # temp = mpii_get_sequence_info(subject, seq)
            # frames = temp[0]
            # fps = temp[1]

            data = h5py.File(os.path.join(root, file), 'r')

            # `.value` was removed in h5py >= 3.0; `[()]` is the equivalent.
            valid_frame = np.squeeze(data['valid_frame'][()])

            data_2d = np.squeeze(data['annot2'][()])
            data_3d = np.squeeze(data['univ_annot3'][()])

            dic_data = {"data_2d": data_2d, "data_3d": data_3d, "valid": valid_frame}

            dic_seq.update({path[-1]: dic_data})

np.savez_compressed('data_test_3dhp', data=dic_seq)
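
# Reload sketch: keep only the frames flagged as valid for evaluation.
#   data = np.load('data_test_3dhp.npz', allow_pickle=True)['data'].item()
#   seq = data['TS1']
#   valid = seq['valid'].astype(bool)
#   poses_2d = seq['data_2d'][valid]   # (n_valid, 17, 2), assuming the
#   poses_3d = seq['data_3d'][valid]   # 17-joint test annotations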
@@ -0,0 +1,199 @@
import numpy as np


class ChunkedGenerator:
    def __init__(self, batch_size, cameras, poses_3d, poses_2d, valid_frame,
                 chunk_length=1, pad=0, causal_shift=0,
                 shuffle=False, random_seed=1234,
                 augment=False, reverse_aug=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None,
                 endless=False, out_all=False, MAE=False, train=True):
        assert poses_3d is None or len(poses_3d) == len(poses_2d), (len(poses_3d), len(poses_2d))
        assert cameras is None or len(cameras) == len(poses_2d)

        pairs = []
        self.saved_index = {}
        start_index = 0

        if train:
            for key in poses_2d.keys():
                assert poses_3d is None or poses_2d[key].shape[0] == poses_3d[key].shape[0]
                n_chunks = (poses_2d[key].shape[0] + chunk_length - 1) // chunk_length
                offset = (n_chunks * chunk_length - poses_2d[key].shape[0]) // 2
                bounds = np.arange(n_chunks + 1) * chunk_length - offset
                # Note: len(bounds) - 1 is the number of chunks (the original
                # len(bounds - 1) only worked because zip() truncates to the
                # shortest argument).
                augment_vector = np.full(len(bounds) - 1, False, dtype=bool)
                reverse_augment_vector = np.full(len(bounds) - 1, False, dtype=bool)
                keys = np.tile(np.array(key).reshape([1, 3]), (len(bounds) - 1, 1))
                pairs += list(zip(keys, bounds[:-1], bounds[1:], augment_vector, reverse_augment_vector))
                if reverse_aug:
                    pairs += list(zip(keys, bounds[:-1], bounds[1:], augment_vector, ~reverse_augment_vector))
                if augment:
                    if reverse_aug:
                        pairs += list(zip(keys, bounds[:-1], bounds[1:], ~augment_vector, ~reverse_augment_vector))
                    else:
                        pairs += list(zip(keys, bounds[:-1], bounds[1:], ~augment_vector, reverse_augment_vector))

                end_index = start_index + poses_3d[key].shape[0]
                self.saved_index[key] = [start_index, end_index]
                start_index = start_index + poses_3d[key].shape[0]
        else:
            for key in poses_2d.keys():
                assert poses_3d is None or poses_2d[key].shape[0] == poses_3d[key].shape[0]
                n_chunks = (poses_2d[key].shape[0] + chunk_length - 1) // chunk_length
                offset = (n_chunks * chunk_length - poses_2d[key].shape[0]) // 2
                bounds = np.arange(n_chunks) * chunk_length - offset
                # Only chunks whose frame is flagged as valid are evaluated.
                bounds_low = bounds[valid_frame[key].astype(bool)]
                bounds_high = bounds[valid_frame[key].astype(bool)] + np.ones(bounds_low.shape[0], dtype=int)

                augment_vector = np.full(len(bounds_low), False, dtype=bool)
                reverse_augment_vector = np.full(len(bounds_low), False, dtype=bool)
                keys = np.tile(np.array(key).reshape([1, 1]), (len(bounds_low), 1))
                pairs += list(zip(keys, bounds_low, bounds_high, augment_vector, reverse_augment_vector))
                if reverse_aug:
                    pairs += list(zip(keys, bounds_low, bounds_high, augment_vector, ~reverse_augment_vector))
                if augment:
                    if reverse_aug:
                        pairs += list(zip(keys, bounds_low, bounds_high, ~augment_vector, ~reverse_augment_vector))
                    else:
                        pairs += list(zip(keys, bounds_low, bounds_high, ~augment_vector, reverse_augment_vector))

                end_index = start_index + poses_3d[key].shape[0]
                self.saved_index[key] = [start_index, end_index]
                start_index = start_index + poses_3d[key].shape[0]

        if cameras is not None:
            self.batch_cam = np.empty((batch_size, cameras[key].shape[-1]))

        if poses_3d is not None:
            self.batch_3d = np.empty((batch_size, chunk_length, poses_3d[key].shape[-2], poses_3d[key].shape[-1]))
        self.batch_2d = np.empty((batch_size, chunk_length + 2 * pad, poses_2d[key].shape[-2], poses_2d[key].shape[-1]))

        self.num_batches = (len(pairs) + batch_size - 1) // batch_size
        self.batch_size = batch_size
        self.random = np.random.RandomState(random_seed)
        self.pairs = pairs
        self.shuffle = shuffle
        self.pad = pad
        self.causal_shift = causal_shift
        self.endless = endless
        self.state = None

        self.cameras = cameras
        if cameras is not None:
            self.cameras = cameras
        self.poses_3d = poses_3d
        self.poses_2d = poses_2d

        self.augment = augment
        self.kps_left = kps_left
        self.kps_right = kps_right
        self.joints_left = joints_left
        self.joints_right = joints_right
        self.out_all = out_all
        self.MAE = MAE

        self.valid_frame = valid_frame
        self.train = train
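
    # Worked example of the chunking above (a sketch, assuming chunk_length=1
    # in the training branch): for a 4-frame sequence, n_chunks = 4,
    # offset = 0 and bounds = [0, 1, 2, 3, 4], so zip(bounds[:-1], bounds[1:])
    # yields the per-frame windows (0,1), (1,2), (2,3), (3,4); each pair is
    # later expanded to 2*pad + 1 input frames in get_batch().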

    def num_frames(self):
        return self.num_batches * self.batch_size

    def random_state(self):
        return self.random

    def set_random_state(self, random):
        self.random = random

    def augment_enabled(self):
        return self.augment

    def next_pairs(self):
        if self.state is None:
            if self.shuffle:
                pairs = self.random.permutation(self.pairs)
            else:
                pairs = self.pairs
            return 0, pairs
        else:
            return self.state

    def get_batch(self, seq_i, start_3d, end_3d, flip, reverse):
        if self.train:
            subject, seq, cam_index = seq_i
            seq_name = (subject, seq, cam_index)
        else:
            seq_name = seq_i[0]
        start_2d = start_3d - self.pad - self.causal_shift
        end_2d = end_3d + self.pad - self.causal_shift

        # Crop the 2D window and edge-pad where it runs past the sequence.
        seq_2d = self.poses_2d[seq_name].copy()
        low_2d = max(start_2d, 0)
        high_2d = min(end_2d, seq_2d.shape[0])
        pad_left_2d = low_2d - start_2d
        pad_right_2d = end_2d - high_2d
        if pad_left_2d != 0 or pad_right_2d != 0:
            self.batch_2d = np.pad(seq_2d[low_2d:high_2d], ((pad_left_2d, pad_right_2d), (0, 0), (0, 0)), 'edge')
        else:
            self.batch_2d = seq_2d[low_2d:high_2d]

        if flip:
            # Horizontal flip: negate x and swap left/right keypoints.
            self.batch_2d[:, :, 0] *= -1
            self.batch_2d[:, self.kps_left + self.kps_right] = self.batch_2d[:, self.kps_right + self.kps_left]
        if reverse:
            self.batch_2d = self.batch_2d[::-1].copy()

        if not self.MAE:
            if self.poses_3d is not None:
                seq_3d = self.poses_3d[seq_name].copy()
                if self.out_all:
                    low_3d = low_2d
                    high_3d = high_2d
                    pad_left_3d = pad_left_2d
                    pad_right_3d = pad_right_2d
                else:
                    low_3d = max(start_3d, 0)
                    high_3d = min(end_3d, seq_3d.shape[0])
                    pad_left_3d = low_3d - start_3d
                    pad_right_3d = end_3d - high_3d
                if pad_left_3d != 0 or pad_right_3d != 0:
                    self.batch_3d = np.pad(seq_3d[low_3d:high_3d],
                                           ((pad_left_3d, pad_right_3d), (0, 0), (0, 0)), 'edge')
                else:
                    self.batch_3d = seq_3d[low_3d:high_3d]

                if flip:
                    self.batch_3d[:, :, 0] *= -1
                    self.batch_3d[:, self.joints_left + self.joints_right] = \
                        self.batch_3d[:, self.joints_right + self.joints_left]
                if reverse:
                    self.batch_3d = self.batch_3d[::-1].copy()

        if self.cameras is not None:
            self.batch_cam = self.cameras[seq_name].copy()
            if flip:
                # Flip cx (index 2) and the first tangential distortion
                # coefficient (index 7) for mirrored samples.
                self.batch_cam[2] *= -1
                self.batch_cam[7] *= -1
        if self.train:
            if self.MAE:
                return np.zeros(9), self.batch_2d.copy(), seq, subject, int(cam_index)
            if self.poses_3d is None and self.cameras is None:
                return None, None, self.batch_2d.copy(), seq, subject, int(cam_index)
            elif self.poses_3d is not None and self.cameras is None:
                return np.zeros(9), self.batch_3d.copy(), self.batch_2d.copy(), seq, subject, int(cam_index)
            elif self.poses_3d is None:
                return self.batch_cam, None, self.batch_2d.copy(), seq, subject, int(cam_index)
            else:
                return self.batch_cam, self.batch_3d.copy(), self.batch_2d.copy(), seq, subject, int(cam_index)
        else:
            if self.MAE:
                return np.zeros(9), self.batch_2d.copy(), seq_name, None, None
            else:
                return np.zeros(9), self.batch_3d.copy(), self.batch_2d.copy(), seq_name, None, None
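
    # Example of the 2D window arithmetic above: with pad=13 (a 27-frame
    # model), chunk_length=1 and start_3d=0, the window is
    # start_2d = -13 .. end_2d = 14; the 13 out-of-range frames on the left
    # are filled by edge-padding, so every chunk yields exactly
    # 2*pad + chunk_length input frames.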
@@ -0,0 +1,187 @@
import numpy as np


class ChunkedGenerator:
    def __init__(self, batch_size, cameras, poses_3d, poses_2d,
                 chunk_length=1, pad=0, causal_shift=0,
                 shuffle=False, random_seed=1234,
                 augment=False, reverse_aug=False, kps_left=None, kps_right=None, joints_left=None, joints_right=None,
                 endless=False, out_all=False, MAE=False, tds=1):
        assert poses_3d is None or len(poses_3d) == len(poses_2d), (len(poses_3d), len(poses_2d))
        assert cameras is None or len(cameras) == len(poses_2d)

        pairs = []
        self.saved_index = {}
        start_index = 0

        for key in poses_2d.keys():
            # Fixed: the original compared poses_3d[key] against itself here.
            assert poses_3d is None or poses_2d[key].shape[0] == poses_3d[key].shape[0]
            n_chunks = (poses_2d[key].shape[0] + chunk_length - 1) // chunk_length
            offset = (n_chunks * chunk_length - poses_2d[key].shape[0]) // 2
            bounds = np.arange(n_chunks + 1) * chunk_length - offset
            augment_vector = np.full(len(bounds) - 1, False, dtype=bool)
            reverse_augment_vector = np.full(len(bounds) - 1, False, dtype=bool)
            keys = np.tile(np.array(key).reshape([1, 3]), (len(bounds) - 1, 1))
            pairs += list(zip(keys, bounds[:-1], bounds[1:], augment_vector, reverse_augment_vector))
            if reverse_aug:
                pairs += list(zip(keys, bounds[:-1], bounds[1:], augment_vector, ~reverse_augment_vector))
            if augment:
                if reverse_aug:
                    pairs += list(zip(keys, bounds[:-1], bounds[1:], ~augment_vector, ~reverse_augment_vector))
                else:
                    pairs += list(zip(keys, bounds[:-1], bounds[1:], ~augment_vector, reverse_augment_vector))

            end_index = start_index + poses_3d[key].shape[0]
            self.saved_index[key] = [start_index, end_index]
            start_index = start_index + poses_3d[key].shape[0]

        if cameras is not None:
            self.batch_cam = np.empty((batch_size, cameras[key].shape[-1]))

        if poses_3d is not None:
            self.batch_3d = np.empty((batch_size, chunk_length, poses_3d[key].shape[-2], poses_3d[key].shape[-1]))
        self.batch_2d = np.empty((batch_size, chunk_length + 2 * pad, poses_2d[key].shape[-2], poses_2d[key].shape[-1]))

        self.num_batches = (len(pairs) + batch_size - 1) // batch_size
        self.batch_size = batch_size
        self.random = np.random.RandomState(random_seed)
        self.pairs = pairs
        self.shuffle = shuffle
        self.pad = pad
        self.causal_shift = causal_shift
        self.endless = endless
        self.state = None

        self.cameras = cameras
        if cameras is not None:
            self.cameras = cameras
        self.poses_3d = poses_3d
        self.poses_2d = poses_2d

        self.augment = augment
        self.kps_left = kps_left
        self.kps_right = kps_right
        self.joints_left = joints_left
        self.joints_right = joints_right
        self.out_all = out_all
        self.MAE = MAE
        self.tds = tds
        self.chunk_length = chunk_length

    def num_frames(self):
        return self.num_batches * self.batch_size

    def random_state(self):
        return self.random

    def set_random_state(self, random):
        self.random = random

    def augment_enabled(self):
        return self.augment

    def next_pairs(self):
        if self.state is None:
            if self.shuffle:
                pairs = self.random.permutation(self.pairs)
            else:
                pairs = self.pairs
            return 0, pairs
        else:
            return self.state

    def get_batch(self, seq_i, start_3d, end_3d, flip, reverse):
        subject, action, cam_index = seq_i
        seq_name = (subject, action, int(cam_index))
        # The 2D receptive field is widened by the temporal downsampling
        # factor `tds`; frames are later subsampled with stride `tds`.
        if self.chunk_length == 1:
            start_2d = start_3d - self.pad * self.tds - self.causal_shift
            end_2d = end_3d + self.pad * self.tds - self.causal_shift
        else:
            mid = end_3d - self.pad
            start_2d = mid - self.pad * self.tds - self.causal_shift - 1
            end_2d = mid + self.pad * self.tds - self.causal_shift

        seq_2d = self.poses_2d[seq_name].copy()
        low_2d = max(start_2d, 0)
        high_2d = min(end_2d, seq_2d.shape[0])
        pad_left_2d = low_2d - start_2d
        pad_right_2d = end_2d - high_2d
        # Pad by repeating the first/last frame, then subsample with `tds`.
        # (A window is assumed to overrun at most one end of the sequence.)
        if pad_left_2d != 0:
            data_pad = np.repeat(seq_2d[0:1], pad_left_2d, axis=0)
            new_data = np.concatenate((data_pad, seq_2d[low_2d:high_2d]), axis=0)
            self.batch_2d = new_data[::self.tds]
        elif pad_right_2d != 0:
            data_pad = np.repeat(seq_2d[seq_2d.shape[0] - 1:seq_2d.shape[0]], pad_right_2d, axis=0)
            new_data = np.concatenate((seq_2d[low_2d:high_2d], data_pad), axis=0)
            self.batch_2d = new_data[::self.tds]
        else:
            self.batch_2d = seq_2d[low_2d:high_2d:self.tds]

        if flip:
            self.batch_2d[:, :, 0] *= -1
            self.batch_2d[:, self.kps_left + self.kps_right] = self.batch_2d[:, self.kps_right + self.kps_left]
        if reverse:
            self.batch_2d = self.batch_2d[::-1].copy()

        if not self.MAE:
            if self.poses_3d is not None:
                seq_3d = self.poses_3d[seq_name].copy()
                if self.out_all:
                    low_3d = low_2d
                    high_3d = high_2d
                    pad_left_3d = pad_left_2d
                    pad_right_3d = pad_right_2d
                else:
                    low_3d = max(start_3d, 0)
                    high_3d = min(end_3d, seq_3d.shape[0])
                    pad_left_3d = low_3d - start_3d
                    pad_right_3d = end_3d - high_3d

                if pad_left_3d != 0:
                    data_pad = np.repeat(seq_3d[0:1], pad_left_3d, axis=0)
                    new_data = np.concatenate((data_pad, seq_3d[low_3d:high_3d]), axis=0)
                    self.batch_3d = new_data[::self.tds]
                elif pad_right_3d != 0:
                    data_pad = np.repeat(seq_3d[seq_3d.shape[0] - 1:seq_3d.shape[0]], pad_right_3d, axis=0)
                    new_data = np.concatenate((seq_3d[low_3d:high_3d], data_pad), axis=0)
                    self.batch_3d = new_data[::self.tds]
                else:
                    self.batch_3d = seq_3d[low_3d:high_3d:self.tds]

                if flip:
                    self.batch_3d[:, :, 0] *= -1
                    self.batch_3d[:, self.joints_left + self.joints_right] = \
                        self.batch_3d[:, self.joints_right + self.joints_left]
                if reverse:
                    self.batch_3d = self.batch_3d[::-1].copy()

        if self.cameras is not None:
            self.batch_cam = self.cameras[seq_name].copy()
            if flip:
                self.batch_cam[2] *= -1
                self.batch_cam[7] *= -1

        if self.MAE:
            return self.batch_cam, self.batch_2d.copy(), action, subject, int(cam_index)
        if self.poses_3d is None and self.cameras is None:
            return None, None, self.batch_2d.copy(), action, subject, int(cam_index)
        elif self.poses_3d is not None and self.cameras is None:
            return np.zeros(9), self.batch_3d.copy(), self.batch_2d.copy(), action, subject, int(cam_index)
        elif self.poses_3d is None:
            return self.batch_cam, None, self.batch_2d.copy(), action, subject, int(cam_index)
        else:
            return self.batch_cam, self.batch_3d.copy(), self.batch_2d.copy(), action, subject, int(cam_index)
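
    # Sketch of the temporal downsampling above: with chunk_length=1, pad=40
    # and tds=2, the raw window covers 2*40*2 + 1 = 161 consecutive frames,
    # and the [::2] stride keeps 81 of them, so an 81-frame model sees twice
    # the temporal span at the same input length.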
@@ -0,0 +1,252 @@

import numpy as np
import copy
from common.skeleton import Skeleton
from common.mocap_dataset import MocapDataset
from common.camera import normalize_screen_coordinates

h36m_skeleton = Skeleton(parents=[-1, 0, 1, 2, 3, 4, 0, 6, 7, 8, 9, 0, 11, 12, 13, 14, 12,
                                  16, 17, 18, 19, 20, 19, 22, 12, 24, 25, 26, 27, 28, 27, 30],
                         joints_left=[6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23],
                         joints_right=[1, 2, 3, 4, 5, 24, 25, 26, 27, 28, 29, 30, 31])

h36m_cameras_intrinsic_params = [
    {
        'id': '54138969',
        'center': [512.54150390625, 515.4514770507812],
        'focal_length': [1145.0494384765625, 1143.7811279296875],
        'radial_distortion': [-0.20709891617298126, 0.24777518212795258, -0.0030751503072679043],
        'tangential_distortion': [-0.0009756988729350269, -0.00142447161488235],
        'res_w': 1000,
        'res_h': 1002,
        'azimuth': 70,
    },
    {
        'id': '55011271',
        'center': [508.8486328125, 508.0649108886719],
        'focal_length': [1149.6756591796875, 1147.5916748046875],
        'radial_distortion': [-0.1942136287689209, 0.2404085397720337, 0.006819975562393665],
        'tangential_distortion': [-0.0016190266469493508, -0.0027408944442868233],
        'res_w': 1000,
        'res_h': 1000,
        'azimuth': -70,
    },
    {
        'id': '58860488',
        'center': [519.8158569335938, 501.40264892578125],
        'focal_length': [1149.1407470703125, 1148.7989501953125],
        'radial_distortion': [-0.2083381861448288, 0.25548800826072693, -0.0024604974314570427],
        'tangential_distortion': [0.0014843869721516967, -0.0007599993259645998],
        'res_w': 1000,
        'res_h': 1000,
        'azimuth': 110,
    },
    {
        'id': '60457274',
        'center': [514.9682006835938, 501.88201904296875],
        'focal_length': [1145.5113525390625, 1144.77392578125],
        'radial_distortion': [-0.198384091258049, 0.21832367777824402, -0.008947807364165783],
        'tangential_distortion': [-0.0005872055771760643, -0.0018133620033040643],
        'res_w': 1000,
        'res_h': 1002,
        'azimuth': -110,
    },
]

h36m_cameras_extrinsic_params = {
    'S1': [
        {
            'orientation': [0.1407056450843811, -0.1500701755285263, -0.755240797996521, 0.6223280429840088],
            'translation': [1841.1070556640625, 4955.28466796875, 1563.4454345703125],
        },
        {
            'orientation': [0.6157187819480896, -0.764836311340332, -0.14833825826644897, 0.11794740706682205],
            'translation': [1761.278564453125, -5078.0068359375, 1606.2650146484375],
        },
        {
            'orientation': [0.14651472866535187, -0.14647851884365082, 0.7653023600578308, -0.6094175577163696],
            'translation': [-1846.7777099609375, 5215.04638671875, 1491.972412109375],
        },
        {
            'orientation': [0.5834008455276489, -0.7853162288665771, 0.14548823237419128, -0.14749594032764435],
            'translation': [-1794.7896728515625, -3722.698974609375, 1574.8927001953125],
        },
    ],
    'S2': [
        {},
        {},
        {},
        {},
    ],
    'S3': [
        {},
        {},
        {},
        {},
    ],
    'S4': [
        {},
        {},
        {},
        {},
    ],
    'S5': [
        {
            'orientation': [0.1467377245426178, -0.162370964884758, -0.7551892995834351, 0.6178938746452332],
            'translation': [2097.3916015625, 4880.94482421875, 1605.732421875],
        },
        {
            'orientation': [0.6159758567810059, -0.7626792192459106, -0.15728192031383514, 0.1189815029501915],
            'translation': [2031.7008056640625, -5167.93310546875, 1612.923095703125],
        },
        {
            'orientation': [0.14291371405124664, -0.12907841801643372, 0.7678384780883789, -0.6110143065452576],
            'translation': [-1620.5948486328125, 5171.65869140625, 1496.43701171875],
        },
        {
            'orientation': [0.5920479893684387, -0.7814217805862427, 0.1274748593568802, -0.15036417543888092],
            'translation': [-1637.1737060546875, -3867.3173828125, 1547.033203125],
        },
    ],
    'S6': [
        {
            'orientation': [0.1337897777557373, -0.15692396461963654, -0.7571090459823608, 0.6198879480361938],
            'translation': [1935.4517822265625, 4950.24560546875, 1618.0838623046875],
        },
        {
            'orientation': [0.6147197484970093, -0.7628812789916992, -0.16174767911434174, 0.11819244921207428],
            'translation': [1969.803955078125, -5128.73876953125, 1632.77880859375],
        },
        {
            'orientation': [0.1529948115348816, -0.13529130816459656, 0.7646096348762512, -0.6112781167030334],
            'translation': [-1769.596435546875, 5185.361328125, 1476.993408203125],
        },
        {
            'orientation': [0.5916101336479187, -0.7804774045944214, 0.12832270562648773, -0.1561593860387802],
            'translation': [-1721.668701171875, -3884.13134765625, 1540.4879150390625],
        },
    ],
    'S7': [
        {
            'orientation': [0.1435241848230362, -0.1631336808204651, -0.7548328638076782, 0.6188824772834778],
            'translation': [1974.512939453125, 4926.3544921875, 1597.8326416015625],
        },
        {
            'orientation': [0.6141672730445862, -0.7638262510299683, -0.1596645563840866, 0.1177929937839508],
            'translation': [1937.0584716796875, -5119.7900390625, 1631.5665283203125],
        },
        {
            'orientation': [0.14550060033798218, -0.12874816358089447, 0.7660516500473022, -0.6127139329910278],
            'translation': [-1741.8111572265625, 5208.24951171875, 1464.8245849609375],
        },
        {
            'orientation': [0.5912848114967346, -0.7821764349937439, 0.12445473670959473, -0.15196487307548523],
            'translation': [-1734.7105712890625, -3832.42138671875, 1548.5830078125],
        },
    ],
    'S8': [
        {
            'orientation': [0.14110587537288666, -0.15589867532253265, -0.7561917304992676, 0.619644045829773],
            'translation': [2150.65185546875, 4896.1611328125, 1611.9046630859375],
        },
        {
            'orientation': [0.6169601678848267, -0.7647668123245239, -0.14846350252628326, 0.11158157885074615],
            'translation': [2219.965576171875, -5148.453125, 1613.0440673828125],
        },
        {
            'orientation': [0.1471444070339203, -0.13377119600772858, 0.7670128345489502, -0.6100369691848755],
            'translation': [-1571.2215576171875, 5137.0185546875, 1498.1761474609375],
        },
        {
            'orientation': [0.5927824378013611, -0.7825870513916016, 0.12147816270589828, -0.14631995558738708],
            'translation': [-1476.913330078125, -3896.7412109375, 1547.97216796875],
        },
    ],
    'S9': [
        {
            'orientation': [0.15540587902069092, -0.15548215806484222, -0.7532095313072205, 0.6199594736099243],
            'translation': [2044.45849609375, 4935.1171875, 1481.2275390625],
        },
        {
            'orientation': [0.618784487247467, -0.7634735107421875, -0.14132238924503326, 0.11933968216180801],
            'translation': [1990.959716796875, -5123.810546875, 1568.8048095703125],
        },
        {
            'orientation': [0.13357827067375183, -0.1367100477218628, 0.7689454555511475, -0.6100738644599915],
            'translation': [-1670.9921875, 5211.98583984375, 1528.387939453125],
        },
        {
            'orientation': [0.5879399180412292, -0.7823407053947449, 0.1427614390850067, -0.14794869720935822],
            'translation': [-1696.04345703125, -3827.099853515625, 1591.4127197265625],
        },
    ],
    'S11': [
        {
            'orientation': [0.15232472121715546, -0.15442320704460144, -0.7547563314437866, 0.6191070079803467],
            'translation': [2098.440185546875, 4926.5546875, 1500.278564453125],
        },
        {
            'orientation': [0.6189449429512024, -0.7600917220115662, -0.15300633013248444, 0.1255258321762085],
            'translation': [2083.182373046875, -4912.1728515625, 1561.07861328125],
        },
        {
            'orientation': [0.14943228662014008, -0.15650227665901184, 0.7681233882904053, -0.6026304364204407],
            'translation': [-1609.8153076171875, 5177.3359375, 1537.896728515625],
        },
        {
            'orientation': [0.5894251465797424, -0.7818877100944519, 0.13991211354732513, -0.14715361595153809],
            'translation': [-1590.738037109375, -3854.1689453125, 1578.017578125],
        },
    ],
}


class Human36mDataset(MocapDataset):
    def __init__(self, path, opt, remove_static_joints=True):
        super().__init__(fps=50, skeleton=h36m_skeleton)
        self.train_list = ['S1', 'S5', 'S6', 'S7', 'S8']
        self.test_list = ['S9', 'S11']

        self._cameras = copy.deepcopy(h36m_cameras_extrinsic_params)
        for cameras in self._cameras.values():
            for i, cam in enumerate(cameras):
                cam.update(h36m_cameras_intrinsic_params[i])
                for k, v in cam.items():
                    if k not in ['id', 'res_w', 'res_h']:
                        cam[k] = np.array(v, dtype='float32')

                if opt.crop_uv == 0:
                    cam['center'] = normalize_screen_coordinates(cam['center'], w=cam['res_w'], h=cam['res_h']).astype(
                        'float32')
                    cam['focal_length'] = cam['focal_length'] / cam['res_w'] * 2

                if 'translation' in cam:
                    cam['translation'] = cam['translation'] / 1000

                cam['intrinsic'] = np.concatenate((cam['focal_length'],
                                                   cam['center'],
                                                   cam['radial_distortion'],
                                                   cam['tangential_distortion']))
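                # Layout of the 9-dim 'intrinsic' vector assembled above:
                # [0:2] focal_length (fx, fy), [2:4] center (cx, cy),
                # [4:7] radial_distortion (k1, k2, k3),
                # [7:9] tangential_distortion (p1, p2). The generators flip
                # indices 2 (cx) and 7 (p1) for horizontally mirrored samples.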

        data = np.load(path, allow_pickle=True)['positions_3d'].item()

        self._data = {}
        for subject, actions in data.items():
            self._data[subject] = {}
            for action_name, positions in actions.items():
                self._data[subject][action_name] = {
                    'positions': positions,
                    'cameras': self._cameras[subject],
                }

        if remove_static_joints:
            # Reduce the 32-joint Human3.6M skeleton to 17 moving joints.
            self.remove_joints([4, 5, 9, 10, 11, 16, 20, 21, 22, 23, 24, 28, 29, 30, 31])

            # Rewire shoulders to the correct parents after the removal.
            self._skeleton._parents[11] = 8
            self._skeleton._parents[14] = 8

    def supports_semi_supervised(self):
        return True
@@ -0,0 +1,194 @@

import torch.utils.data as data
import numpy as np

from common.utils import deterministic_random
from common.camera import world_to_camera, normalize_screen_coordinates
from common.generator_3dhp import ChunkedGenerator


class Fusion(data.Dataset):
    def __init__(self, opt, root_path, train=True, MAE=False):
        self.data_type = opt.dataset
        self.train = train
        self.keypoints_name = opt.keypoints
        self.root_path = root_path

        self.train_list = opt.subjects_train.split(',')
        self.test_list = opt.subjects_test.split(',')
        self.action_filter = None if opt.actions == '*' else opt.actions.split(',')
        self.downsample = opt.downsample
        self.subset = opt.subset
        self.stride = opt.stride
        self.crop_uv = opt.crop_uv
        self.test_aug = opt.test_augmentation
        self.pad = opt.pad
        self.MAE = MAE
        if self.train:
            self.poses_train, self.poses_train_2d = self.prepare_data(opt.root_path, train=True)
            self.generator = ChunkedGenerator(opt.batchSize // opt.stride, None, self.poses_train,
                                              self.poses_train_2d, None, chunk_length=self.stride, pad=self.pad,
                                              augment=opt.data_augmentation, reverse_aug=opt.reverse_augmentation,
                                              kps_left=self.kps_left, kps_right=self.kps_right,
                                              joints_left=self.joints_left,
                                              joints_right=self.joints_right, out_all=opt.out_all, MAE=MAE, train=True)
            print('INFO: Training on {} frames'.format(self.generator.num_frames()))
        else:
            self.poses_test, self.poses_test_2d, self.valid_frame = self.prepare_data(opt.root_path, train=False)
            self.generator = ChunkedGenerator(opt.batchSize // opt.stride, None, self.poses_test,
                                              self.poses_test_2d, self.valid_frame,
                                              pad=self.pad, augment=False, kps_left=self.kps_left,
                                              kps_right=self.kps_right, joints_left=self.joints_left,
                                              joints_right=self.joints_right, MAE=MAE, train=False)
            self.key_index = self.generator.saved_index
            print('INFO: Testing on {} frames'.format(self.generator.num_frames()))

    def prepare_data(self, path, train=True):
        out_poses_3d = {}
        out_poses_2d = {}
        valid_frame = {}

        self.kps_left, self.kps_right = [5, 6, 7, 11, 12, 13], [2, 3, 4, 8, 9, 10]
        self.joints_left, self.joints_right = [5, 6, 7, 11, 12, 13], [2, 3, 4, 8, 9, 10]

        if train:
            data = np.load(path + "data_train_3dhp.npz", allow_pickle=True)['data'].item()
            for seq in data.keys():
                for cam in data[seq][0].keys():
                    anim = data[seq][0][cam]

                    subject_name, seq_name = seq.split(" ")

                    # Root-center all joints around joint 14 (the root),
                    # which itself keeps its absolute position.
                    data_3d = anim['data_3d']
                    data_3d[:, :14] -= data_3d[:, 14:15]
                    data_3d[:, 15:] -= data_3d[:, 14:15]
                    out_poses_3d[(subject_name, seq_name, cam)] = data_3d

                    data_2d = anim['data_2d']

                    data_2d[..., :2] = normalize_screen_coordinates(data_2d[..., :2], w=2048, h=2048)
                    out_poses_2d[(subject_name, seq_name, cam)] = data_2d

            return out_poses_3d, out_poses_2d
        else:
            data = np.load(path + "data_test_3dhp.npz", allow_pickle=True)['data'].item()
            for seq in data.keys():

                anim = data[seq]

                valid_frame[seq] = anim["valid"]

                data_3d = anim['data_3d']
                data_3d[:, :14] -= data_3d[:, 14:15]
                data_3d[:, 15:] -= data_3d[:, 14:15]
                out_poses_3d[seq] = data_3d

                data_2d = anim['data_2d']

                # TS5 and TS6 were captured at a different resolution.
                if seq == "TS5" or seq == "TS6":
                    width = 1920
                    height = 1080
                else:
                    width = 2048
                    height = 2048
                data_2d[..., :2] = normalize_screen_coordinates(data_2d[..., :2], w=width, h=height)
                out_poses_2d[seq] = data_2d

            return out_poses_3d, out_poses_2d, valid_frame

    def fetch(self, dataset, subjects, subset=1, parse_3d_poses=True):
        # Retained from the Human3.6M loader; `self.keypoints` is never set
        # in this class, so this method is effectively unused here.
        out_poses_3d = {}
        out_poses_2d = {}
        out_camera_params = {}

        for subject in subjects:
            for action in self.keypoints[subject].keys():
                if self.action_filter is not None:
                    found = False
                    for a in self.action_filter:
                        if action.startswith(a):
                            found = True
                            break
                    if not found:
                        continue

                poses_2d = self.keypoints[subject][action]

                for i in range(len(poses_2d)):
                    out_poses_2d[(subject, action, i)] = poses_2d[i]

                if subject in dataset.cameras():
                    cams = dataset.cameras()[subject]
                    assert len(cams) == len(poses_2d), 'Camera count mismatch'
                    for i, cam in enumerate(cams):
                        if 'intrinsic' in cam:
                            out_camera_params[(subject, action, i)] = cam['intrinsic']

                if parse_3d_poses and 'positions_3d' in dataset[subject][action]:
                    poses_3d = dataset[subject][action]['positions_3d']
                    assert len(poses_3d) == len(poses_2d), 'Camera count mismatch'
                    for i in range(len(poses_3d)):
                        out_poses_3d[(subject, action, i)] = poses_3d[i]

        if len(out_camera_params) == 0:
            out_camera_params = None
        if len(out_poses_3d) == 0:
            out_poses_3d = None

        stride = self.downsample
        if subset < 1:
            for key in out_poses_2d.keys():
                n_frames = int(round(len(out_poses_2d[key]) // stride * subset) * stride)
                start = deterministic_random(0, len(out_poses_2d[key]) - n_frames + 1, str(len(out_poses_2d[key])))
                out_poses_2d[key] = out_poses_2d[key][start:start + n_frames:stride]
                if out_poses_3d is not None:
                    out_poses_3d[key] = out_poses_3d[key][start:start + n_frames:stride]
        elif stride > 1:
            for key in out_poses_2d.keys():
                out_poses_2d[key] = out_poses_2d[key][::stride]
                if out_poses_3d is not None:
                    out_poses_3d[key] = out_poses_3d[key][::stride]

        return out_camera_params, out_poses_3d, out_poses_2d

    def __len__(self):
        return len(self.generator.pairs)

    def __getitem__(self, index):
        seq_name, start_3d, end_3d, flip, reverse = self.generator.pairs[index]

        if self.MAE:
            cam, input_2D, seq, subject, cam_ind = self.generator.get_batch(seq_name, start_3d, end_3d, flip,
                                                                            reverse)
            if not self.train and self.test_aug:
                _, input_2D_aug, _, _, _ = self.generator.get_batch(seq_name, start_3d, end_3d, flip=True, reverse=reverse)
                input_2D = np.concatenate((np.expand_dims(input_2D, axis=0), np.expand_dims(input_2D_aug, axis=0)), 0)
        else:
            cam, gt_3D, input_2D, seq, subject, cam_ind = self.generator.get_batch(seq_name, start_3d, end_3d, flip, reverse)

            if not self.train and self.test_aug:
                _, _, input_2D_aug, _, _, _ = self.generator.get_batch(seq_name, start_3d, end_3d, flip=True, reverse=reverse)
                input_2D = np.concatenate((np.expand_dims(input_2D, axis=0), np.expand_dims(input_2D_aug, axis=0)), 0)

        bb_box = np.array([0, 0, 1, 1])
        input_2D_update = input_2D

        # np.float was removed in NumPy >= 1.24; use np.float64 instead.
        scale = np.float64(1.0)

        if self.MAE:
            if self.train:
                return cam, input_2D_update, seq, subject, scale, bb_box, cam_ind
            else:
                return cam, input_2D_update, seq, scale, bb_box
        else:
            if self.train:
                return cam, gt_3D, input_2D_update, seq, subject, scale, bb_box, cam_ind
            else:
                return cam, gt_3D, input_2D_update, seq, scale, bb_box
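
    # Usage sketch (with a hypothetical `opt` providing the fields read above):
    #   test_set = Fusion(opt, root_path=opt.root_path, train=False)
    #   loader = torch.utils.data.DataLoader(test_set, batch_size=opt.batchSize,
    #                                        shuffle=False, num_workers=4)
    #   for cam, gt_3D, input_2D, seq, scale, bb_box in loader:
    #       ...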
@@ -0,0 +1,180 @@

import torch.utils.data as data
import numpy as np

from common.utils import deterministic_random
from common.camera import world_to_camera, normalize_screen_coordinates
from common.generator_tds import ChunkedGenerator


class Fusion(data.Dataset):
    def __init__(self, opt, dataset, root_path, train=True, MAE=False, tds=1):
        self.data_type = opt.dataset
        self.train = train
        self.keypoints_name = opt.keypoints
        self.root_path = root_path

        self.train_list = opt.subjects_train.split(',')
        self.test_list = opt.subjects_test.split(',')
        self.action_filter = None if opt.actions == '*' else opt.actions.split(',')
        self.downsample = opt.downsample
        self.subset = opt.subset
        self.stride = opt.stride
        self.crop_uv = opt.crop_uv
        self.test_aug = opt.test_augmentation
        self.pad = opt.pad
        self.MAE = MAE
        if self.train:
            self.keypoints = self.prepare_data(dataset, self.train_list)
            self.cameras_train, self.poses_train, self.poses_train_2d = self.fetch(dataset, self.train_list,
                                                                                   subset=self.subset)
            self.generator = ChunkedGenerator(opt.batchSize // opt.stride, self.cameras_train, self.poses_train,
                                              self.poses_train_2d, self.stride, pad=self.pad,
                                              augment=opt.data_augmentation, reverse_aug=opt.reverse_augmentation,
                                              kps_left=self.kps_left, kps_right=self.kps_right,
                                              joints_left=self.joints_left,
                                              joints_right=self.joints_right, out_all=opt.out_all, MAE=MAE, tds=tds)
            print('INFO: Training on {} frames'.format(self.generator.num_frames()))
        else:
            self.keypoints = self.prepare_data(dataset, self.test_list)
            self.cameras_test, self.poses_test, self.poses_test_2d = self.fetch(dataset, self.test_list,
                                                                                subset=self.subset)
            self.generator = ChunkedGenerator(opt.batchSize // opt.stride, self.cameras_test, self.poses_test,
                                              self.poses_test_2d,
                                              pad=self.pad, augment=False, kps_left=self.kps_left,
                                              kps_right=self.kps_right, joints_left=self.joints_left,
                                              joints_right=self.joints_right, MAE=MAE, tds=tds)
            self.key_index = self.generator.saved_index
            print('INFO: Testing on {} frames'.format(self.generator.num_frames()))

    def prepare_data(self, dataset, folder_list):
        for subject in folder_list:
            for action in dataset[subject].keys():
                anim = dataset[subject][action]

                positions_3d = []
                for cam in anim['cameras']:
                    pos_3d = world_to_camera(anim['positions'], R=cam['orientation'], t=cam['translation'])
                    pos_3d[:, 1:] -= pos_3d[:, :1]

                    if self.keypoints_name.startswith('sh'):
                        pos_3d = np.delete(pos_3d, obj=9, axis=1)
                    positions_3d.append(pos_3d)
                anim['positions_3d'] = positions_3d
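
                # Note: `pos_3d[:, 1:] -= pos_3d[:, :1]` above makes every
                # joint root-relative while joint 0 (the root) keeps its
                # absolute camera-space position, preserving the trajectory.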

        keypoints = np.load(self.root_path + 'data_2d_' + self.data_type + '_' + self.keypoints_name + '.npz', allow_pickle=True)
        keypoints_symmetry = keypoints['metadata'].item()['keypoints_symmetry']

        self.kps_left, self.kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1])
        self.joints_left, self.joints_right = list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right())
        keypoints = keypoints['positions_2d'].item()

        for subject in folder_list:
            assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(subject)
            for action in dataset[subject].keys():
                assert action in keypoints[subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(action, subject)
                for cam_idx in range(len(keypoints[subject][action])):

                    # Truncate the 2D detections to the mocap length.
                    mocap_length = dataset[subject][action]['positions_3d'][cam_idx].shape[0]
                    assert keypoints[subject][action][cam_idx].shape[0] >= mocap_length

                    if keypoints[subject][action][cam_idx].shape[0] > mocap_length:
                        keypoints[subject][action][cam_idx] = keypoints[subject][action][cam_idx][:mocap_length]

        for subject in keypoints.keys():
            for action in keypoints[subject]:
                for cam_idx, kps in enumerate(keypoints[subject][action]):
                    cam = dataset.cameras()[subject][cam_idx]
                    if self.crop_uv == 0:
                        kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h'])
                    keypoints[subject][action][cam_idx] = kps

        return keypoints

    def fetch(self, dataset, subjects, subset=1, parse_3d_poses=True):
        out_poses_3d = {}
        out_poses_2d = {}
        out_camera_params = {}

        for subject in subjects:
            for action in self.keypoints[subject].keys():
                if self.action_filter is not None:
                    found = False
                    for a in self.action_filter:
                        if action.startswith(a):
                            found = True
                            break
                    if not found:
                        continue

                poses_2d = self.keypoints[subject][action]

                for i in range(len(poses_2d)):
                    out_poses_2d[(subject, action, i)] = poses_2d[i]

                if subject in dataset.cameras():
                    cams = dataset.cameras()[subject]
                    assert len(cams) == len(poses_2d), 'Camera count mismatch'
                    for i, cam in enumerate(cams):
                        if 'intrinsic' in cam:
                            out_camera_params[(subject, action, i)] = cam['intrinsic']

                if parse_3d_poses and 'positions_3d' in dataset[subject][action]:
                    poses_3d = dataset[subject][action]['positions_3d']
                    assert len(poses_3d) == len(poses_2d), 'Camera count mismatch'
                    for i in range(len(poses_3d)):
                        out_poses_3d[(subject, action, i)] = poses_3d[i]

        if len(out_camera_params) == 0:
            out_camera_params = None
        if len(out_poses_3d) == 0:
            out_poses_3d = None

        stride = self.downsample
        if subset < 1:
            for key in out_poses_2d.keys():
                n_frames = int(round(len(out_poses_2d[key]) // stride * subset) * stride)
                start = deterministic_random(0, len(out_poses_2d[key]) - n_frames + 1, str(len(out_poses_2d[key])))
                out_poses_2d[key] = out_poses_2d[key][start:start + n_frames:stride]
                if out_poses_3d is not None:
                    out_poses_3d[key] = out_poses_3d[key][start:start + n_frames:stride]
        elif stride > 1:
            for key in out_poses_2d.keys():
                out_poses_2d[key] = out_poses_2d[key][::stride]
                if out_poses_3d is not None:
                    out_poses_3d[key] = out_poses_3d[key][::stride]

        return out_camera_params, out_poses_3d, out_poses_2d

    def __len__(self):
        return len(self.generator.pairs)

    def __getitem__(self, index):
        seq_name, start_3d, end_3d, flip, reverse = self.generator.pairs[index]

        if self.MAE:
            cam, input_2D, action, subject, cam_ind = self.generator.get_batch(seq_name, start_3d, end_3d, flip,
                                                                               reverse)
            if not self.train and self.test_aug:
                _, input_2D_aug, _, _, _ = self.generator.get_batch(seq_name, start_3d, end_3d, flip=True, reverse=reverse)
                input_2D = np.concatenate((np.expand_dims(input_2D, axis=0), np.expand_dims(input_2D_aug, axis=0)), 0)
        else:
            cam, gt_3D, input_2D, action, subject, cam_ind = self.generator.get_batch(seq_name, start_3d, end_3d, flip, reverse)

            if not self.train and self.test_aug:
                _, _, input_2D_aug, _, _, _ = self.generator.get_batch(seq_name, start_3d, end_3d, flip=True, reverse=reverse)
                input_2D = np.concatenate((np.expand_dims(input_2D, axis=0), np.expand_dims(input_2D_aug, axis=0)), 0)

        bb_box = np.array([0, 0, 1, 1])
        input_2D_update = input_2D

        scale = np.float64(1.0)

        if self.MAE:
            return cam, input_2D_update, action, subject, scale, bb_box, cam_ind
        else:
            return cam, gt_3D, input_2D_update, action, subject, scale, bb_box, cam_ind
@ -0,0 +1,181 @@ |
|||
|
|||
import torch.utils.data as data |
|||
import numpy as np |
|||
|
|||
from common.utils import deterministic_random |
|||
from common.camera import world_to_camera, normalize_screen_coordinates |
|||
from common.generator_tds import ChunkedGenerator |
|||
|
|||
class Fusion(data.Dataset): |
|||
def __init__(self, opt, dataset, root_path, train=True, MAE=False, tds=1): |
|||
self.data_type = opt.dataset |
|||
self.train = train |
|||
self.keypoints_name = opt.keypoints |
|||
self.root_path = root_path |
|||
|
|||
self.train_list = opt.subjects_train.split(',') |
|||
self.test_list = opt.subjects_test.split(',') |
|||
self.action_filter = None if opt.actions == '*' else opt.actions.split(',') |
|||
self.downsample = opt.downsample |
|||
self.subset = opt.subset |
|||
self.stride = opt.stride |
|||
self.crop_uv = opt.crop_uv |
|||
self.test_aug = opt.test_augmentation |
|||
self.pad = opt.pad |
|||
self.MAE=MAE |
|||
if self.train: |
|||
self.keypoints = self.prepare_data(dataset, self.train_list) |
|||
self.cameras_train, self.poses_train, self.poses_train_2d = self.fetch(dataset, self.train_list, |
|||
subset=self.subset) |
|||
self.generator = ChunkedGenerator(opt.batchSize // opt.stride, self.cameras_train, self.poses_train, |
|||
self.poses_train_2d, self.stride, pad=self.pad, |
|||
augment=opt.data_augmentation, reverse_aug=opt.reverse_augmentation, |
|||
kps_left=self.kps_left, kps_right=self.kps_right, |
|||
joints_left=self.joints_left, |
|||
joints_right=self.joints_right, out_all=opt.out_all, MAE=MAE, tds=tds) |
|||
print('INFO: Training on {} frames'.format(self.generator.num_frames())) |
|||
else: |
|||
self.keypoints = self.prepare_data(dataset, self.test_list) |
|||
self.cameras_test, self.poses_test, self.poses_test_2d = self.fetch(dataset, self.test_list, |
|||
subset=self.subset) |
|||
self.generator = ChunkedGenerator(opt.batchSize // opt.stride, self.cameras_test, self.poses_test, |
|||
self.poses_test_2d, |
|||
pad=self.pad, augment=False, kps_left=self.kps_left, |
|||
kps_right=self.kps_right, joints_left=self.joints_left, |
|||
joints_right=self.joints_right, MAE=MAE, tds=tds) |
|||
self.key_index = self.generator.saved_index |
|||
print('INFO: Testing on {} frames'.format(self.generator.num_frames())) |
|||
|
|||
def prepare_data(self, dataset, folder_list): |
|||
for subject in folder_list: |
|||
for action in dataset[subject].keys(): |
|||
anim = dataset[subject][action] |
|||
|
|||
positions_3d = [] |
|||
for cam in anim['cameras']: |
|||
pos_3d = world_to_camera(anim['positions'], R=cam['orientation'], t=cam['translation']) |
|||
pos_3d[:, 1:] -= pos_3d[:, :1] |
|||
|
|||
if self.keypoints_name.startswith('sh'): |
|||
pos_3d = np.delete(pos_3d,obj=9,axis=1) |
|||
positions_3d.append(pos_3d) |
|||
anim['positions_3d'] = positions_3d |
|||
|
|||
keypoints = np.load(self.root_path + 'data_2d_' + self.data_type + '_' + self.keypoints_name + '.npz',allow_pickle=True) |
|||
keypoints_symmetry = keypoints['metadata'].item()['keypoints_symmetry'] |
|||
|
|||
self.kps_left, self.kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1]) |
|||
self.joints_left, self.joints_right = list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right()) |
|||
keypoints = keypoints['positions_2d'].item() |
|||
|
|||
for subject in folder_list: |
|||
assert subject in keypoints, 'Subject {} is missing from the 2D detections dataset'.format(subject) |
|||
for action in dataset[subject].keys(): |
|||
assert action in keypoints[subject], 'Action {} of subject {} is missing from the 2D detections dataset'.format(action, subject)
|||
for cam_idx in range(len(keypoints[subject][action])): |
|||
|
|||
mocap_length = dataset[subject][action]['positions_3d'][cam_idx].shape[0] |
|||
assert keypoints[subject][action][cam_idx].shape[0] >= mocap_length |
|||
|
|||
if keypoints[subject][action][cam_idx].shape[0] > mocap_length: |
|||
keypoints[subject][action][cam_idx] = keypoints[subject][action][cam_idx][:mocap_length] |
|||
|
|||
for subject in keypoints.keys(): |
|||
for action in keypoints[subject]: |
|||
for cam_idx, kps in enumerate(keypoints[subject][action]): |
|||
cam = dataset.cameras()[subject][cam_idx] |
|||
if self.crop_uv == 0: |
|||
kps[..., :2] = normalize_screen_coordinates(kps[..., :2], w=cam['res_w'], h=cam['res_h']) |
|||
keypoints[subject][action][cam_idx] = kps |
|||
|
|||
return keypoints |
|||
|
|||
def fetch(self, dataset, subjects, subset=1, parse_3d_poses=True): |
|||
out_poses_3d = {} |
|||
out_poses_2d = {} |
|||
out_camera_params = {} |
|||
|
|||
for subject in subjects: |
|||
for action in self.keypoints[subject].keys(): |
|||
if self.action_filter is not None: |
|||
found = False |
|||
for a in self.action_filter: |
|||
if action.startswith(a): |
|||
found = True |
|||
break |
|||
if not found: |
|||
continue |
|||
|
|||
poses_2d = self.keypoints[subject][action] |
|||
|
|||
for i in range(len(poses_2d)): |
|||
out_poses_2d[(subject, action, i)] = poses_2d[i][..., :2] |
|||
|
|||
if subject in dataset.cameras(): |
|||
cams = dataset.cameras()[subject] |
|||
assert len(cams) == len(poses_2d), 'Camera count mismatch' |
|||
for i, cam in enumerate(cams): |
|||
if 'intrinsic' in cam: |
|||
out_camera_params[(subject, action, i)] = cam['intrinsic'] |
|||
|
|||
if parse_3d_poses and 'positions_3d' in dataset[subject][action]: |
|||
poses_3d = dataset[subject][action]['positions_3d'] |
|||
assert len(poses_3d) == len(poses_2d), 'Camera count mismatch' |
|||
for i in range(len(poses_3d)): |
|||
out_poses_3d[(subject, action, i)] = poses_3d[i] |
|||
|
|||
if len(out_camera_params) == 0: |
|||
out_camera_params = None |
|||
if len(out_poses_3d) == 0: |
|||
out_poses_3d = None |
|||
|
|||
stride = self.downsample |
|||
if subset < 1: |
|||
for key in out_poses_2d.keys(): |
|||
n_frames = int(round(len(out_poses_2d[key]) // stride * subset) * stride) |
|||
start = deterministic_random(0, len(out_poses_2d[key]) - n_frames + 1, str(len(out_poses_2d[key]))) |
|||
out_poses_2d[key] = out_poses_2d[key][start:start + n_frames:stride] |
|||
if out_poses_3d is not None: |
|||
out_poses_3d[key] = out_poses_3d[key][start:start + n_frames:stride] |
|||
elif stride > 1: |
|||
for key in out_poses_2d.keys(): |
|||
out_poses_2d[key] = out_poses_2d[key][::stride] |
|||
if out_poses_3d is not None: |
|||
out_poses_3d[key] = out_poses_3d[key][::stride] |
|||
|
|||
return out_camera_params, out_poses_3d, out_poses_2d |
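# Worked example of the subsampling above (illustrative numbers): with
# 2000 frames, downsample stride 2 and subset 0.25, n_frames =
# round(2000 // 2 * 0.25) * 2 = 500, so a deterministic 500-frame window
# is chosen and then strided down to 250 poses per sequence.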
|||
|
|||
def __len__(self): |
|||
return len(self.generator.pairs) |
|||
#return 200 |
|||
|
|||
def __getitem__(self, index): |
|||
seq_name, start_3d, end_3d, flip, reverse = self.generator.pairs[index] |
|||
|
|||
if self.MAE: |
|||
cam, input_2D, action, subject, cam_ind = self.generator.get_batch(seq_name, start_3d, end_3d, flip, reverse)
|||
if not self.train and self.test_aug:
|||
_, input_2D_aug, _, _,_ = self.generator.get_batch(seq_name, start_3d, end_3d, flip=True, reverse=reverse) |
|||
input_2D = np.concatenate((np.expand_dims(input_2D,axis=0),np.expand_dims(input_2D_aug,axis=0)),0) |
|||
else: |
|||
cam, gt_3D, input_2D, action, subject, cam_ind = self.generator.get_batch(seq_name, start_3d, end_3d, flip, reverse) |
|||
|
|||
if not self.train and self.test_aug:
|||
_, _, input_2D_aug, _, _,_ = self.generator.get_batch(seq_name, start_3d, end_3d, flip=True, reverse=reverse) |
|||
input_2D = np.concatenate((np.expand_dims(input_2D,axis=0),np.expand_dims(input_2D_aug,axis=0)),0) |
|||
|
|||
bb_box = np.array([0, 0, 1, 1]) |
|||
input_2D_update = input_2D |
|||
|
|||
scale = np.float64(1.0)
|||
|
|||
if self.MAE: |
|||
return cam, input_2D_update, action, subject, scale, bb_box, cam_ind |
|||
else: |
|||
return cam, gt_3D, input_2D_update, action, subject, scale, bb_box, cam_ind |
|||
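# Usage sketch (not part of the original file): how this Fusion dataset is
# typically consumed. `Human36mDataset` and the exact option wiring are
# assumptions based on the option parser later in this dump.
#
# from torch.utils.data import DataLoader
# opt = opts().parse()
# dataset = Human36mDataset(opt.root_path + 'data_3d_h36m.npz', opt)
# train_data = Fusion(opt, dataset, opt.root_path, train=True, tds=opt.t_downsample)
# train_loader = DataLoader(train_data, batch_size=opt.batchSize,
#                           shuffle=True, num_workers=opt.workers, pin_memory=True)
# for cam, gt_3D, input_2D, action, subject, scale, bb_box, cam_ind in train_loader:
#     pass  # each batch matches the tuple returned by __getitem__ above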
|
|||
|
|||
|
@ -0,0 +1,239 @@ |
|||
|
|||
import torch.utils.data as data |
|||
import numpy as np |
|||
|
|||
from common.utils import deterministic_random |
|||
from common.camera import world_to_camera, normalize_screen_coordinates |
|||
from common.generator_tds import ChunkedGenerator |
|||
|
|||
import os.path as path |
|||
|
|||
class Fusion(data.Dataset): |
|||
def __init__(self, opt, dataset, root_path, train=True, MAE=False, tds=1): |
|||
self.data_type = opt.dataset |
|||
self.train = train |
|||
self.keypoints_name = opt.keypoints |
|||
self.root_path = root_path |
|||
|
|||
self.train_list = opt.subjects_train.split(',') |
|||
self.test_list = opt.subjects_test.split(',') |
|||
self.action_filter = None if opt.actions == '*' else opt.actions.split(',') |
|||
self.downsample = opt.downsample |
|||
self.subset = opt.subset |
|||
self.stride = opt.stride |
|||
self.crop_uv = opt.crop_uv |
|||
self.test_aug = opt.test_augmentation |
|||
self.pad = opt.pad |
|||
self.MAE=MAE |
|||
tds = opt.t_downsample  # note: overrides the tds argument passed to __init__
|||
#print(tds) |
|||
#exit() |
|||
if self.train: |
|||
self.keypoints = self.prepare_data(dataset, self.train_list) |
|||
self.cameras_train, self.poses_train, self.poses_train_2d = self.fetch(dataset, self.train_list, |
|||
subset=self.subset) |
|||
self.generator = ChunkedGenerator(opt.batchSize // opt.stride, self.cameras_train, self.poses_train, |
|||
self.poses_train_2d, self.stride, pad=self.pad, |
|||
augment=opt.data_augmentation, reverse_aug=opt.reverse_augmentation, |
|||
kps_left=self.kps_left, kps_right=self.kps_right, |
|||
joints_left=self.joints_left, |
|||
joints_right=self.joints_right, out_all=opt.out_all, MAE=MAE, tds=tds) |
|||
print('INFO: Training on {} frames'.format(self.generator.num_frames())) |
|||
else: |
|||
self.keypoints = self.prepare_data(dataset, self.test_list) |
|||
self.cameras_test, self.poses_test, self.poses_test_2d = self.fetch(dataset, self.test_list, |
|||
subset=self.subset) |
|||
self.generator = ChunkedGenerator(opt.batchSize // opt.stride, self.cameras_test, self.poses_test, |
|||
self.poses_test_2d, |
|||
pad=self.pad, augment=False, kps_left=self.kps_left, |
|||
kps_right=self.kps_right, joints_left=self.joints_left, |
|||
joints_right=self.joints_right, MAE=MAE, tds=tds) |
|||
self.key_index = self.generator.saved_index |
|||
print('INFO: Testing on {} frames'.format(self.generator.num_frames())) |
|||
|
|||
def prepare_data(self, dataset, folder_list): |
|||
HR_PATH = self.root_path + 'HRN'
|||
camera_path = path.join(self.root_path, 'data_2d_h36m_gt.npz')
|||
|
|||
keypoints_train3d_path = path.join(HR_PATH, 'threeDPose_train.npy')
|||
keypoints_test3d_path = path.join(HR_PATH, 'threeDPose_test.npy')
|||
hrn_train3d = np.load(keypoints_train3d_path, allow_pickle=True) |
|||
hrn_test3d = np.load(keypoints_test3d_path, allow_pickle=True) |
|||
|
|||
re_order = [0,1,2,3,6,7,8,12,13,14,15,17,18,19,25,26,27] |
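# re_order selects the standard 17-joint subset out of the raw 32-joint
# Human3.6M skeleton (pelvis, right leg, left leg, spine/neck/head, left
# arm, right arm); the grouping stated here is an assumption based on the
# common 17-joint H36M convention.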
|||
for key in hrn_train3d.item().keys(): |
|||
# print(key) |
|||
subject = 'S'+str(key[0]) |
|||
action = key[2].split('.')[0] |
|||
hrn_train_key=hrn_train3d.item()[key].reshape(-1,32,3) |
|||
hrn_train_key=hrn_train_key[:,re_order,:] |
|||
anim = dataset[subject][action] |
|||
positions_3d = [] |
|||
for cam in anim['cameras']: |
|||
pos_3d = world_to_camera(hrn_train_key.astype(cam['orientation'].dtype)/1000, R=cam['orientation'], t=cam['translation']) |
|||
pos_3d[:, 1:] -= pos_3d[:, :1] |
|||
positions_3d.append(pos_3d) |
|||
anim['positions_3d'] = positions_3d |
|||
|
|||
for key in hrn_test3d.item().keys(): |
|||
subject = 'S'+str(key[0]) |
|||
action = key[2].split('.')[0] |
|||
hrn_test_key=hrn_test3d.item()[key].reshape(-1,32,3) |
|||
hrn_test_key=hrn_test_key[:,re_order,:] |
|||
if subject=='S11' and action == 'Directions': |
|||
continue |
|||
anim = dataset[subject][action] |
|||
positions_3d = [] |
|||
for cam in anim['cameras']: |
|||
pos_3d = world_to_camera(hrn_test_key.astype(cam['orientation'].dtype)/1000, R=cam['orientation'], t=cam['translation']) |
|||
#pos_3d = hrn_test_key |
|||
pos_3d[:, 1:] -= pos_3d[:, :1] |
|||
positions_3d.append(pos_3d) |
|||
anim['positions_3d'] = positions_3d |
|||
|
|||
|
|||
|
|||
keypoints_train = path.join(HR_PATH, 'twoDPose_HRN_train.npy')
|||
keypoints_test = path.join(HR_PATH, 'twoDPose_HRN_test.npy')
|||
keypoints_symmetry = [[4,5,6,11,12,13],[1,2,3,14,15,16]] |
|||
|
|||
keypoints = self.create_2d_data(keypoints_train, keypoints_test, camera_path, dataset)
|||
self.kps_left, self.kps_right = list(keypoints_symmetry[0]), list(keypoints_symmetry[1]) |
|||
self.joints_left, self.joints_right = list(dataset.skeleton().joints_left()), list(dataset.skeleton().joints_right()) |
|||
|
|||
|
|||
return keypoints |
|||
|
|||
|
|||
|
|||
def create_2d_data(self,train_path,test_path,camera_path,dataset): |
|||
keypoints = np.load(camera_path, allow_pickle=True) |
|||
# print(keypoints.keys) |
|||
# exit() |
|||
keypoints = keypoints['positions_2d'].item() |
|||
|
|||
|
|||
re_order = [0,1,2,3,6,7,8,12,13,14,15,17,18,19,25,26,27] |
|||
hrn_train = np.load(train_path, allow_pickle=True) |
|||
hrn_test = np.load(test_path, allow_pickle=True) |
|||
for key in hrn_train.item().keys(): |
|||
subject = 'S'+str(key[0]) |
|||
# print(key) |
|||
# exit() |
|||
action = key[2].split('.')[0] |
|||
hr_cam = key[2].split('.')[1] |
|||
hrn_train_key=hrn_train.item()[key].reshape(-1,32,2) |
|||
hrn_train_key=hrn_train_key[:,re_order,:] |
|||
for cam_idx, kps in enumerate(keypoints[subject][action]): |
|||
cam = dataset.cameras()[subject][cam_idx] |
|||
cameras_name = cam['id'] |
|||
if cameras_name==hr_cam: |
|||
hrn_train_key[..., :2]=normalize_screen_coordinates(hrn_train_key[..., :2], w=cam['res_w'], h=cam['res_h']) |
|||
keypoints[subject][action][cam_idx] = hrn_train_key |
|||
for key in hrn_test.item().keys(): |
|||
subject = 'S'+str(key[0]) |
|||
action = key[2].split('.')[0] |
|||
hr_cam = key[2].split('.')[1] |
|||
hrn_test_key=hrn_test.item()[key].reshape(-1,32,2) |
|||
hrn_test_key=hrn_test_key[:,re_order,:] |
|||
for cam_idx, kps in enumerate(keypoints[subject][action]): |
|||
cam = dataset.cameras()[subject][cam_idx] |
|||
cameras_name = cam['id'] |
|||
# print(key,cam_idx,cameras_name,hr_cam) |
|||
if cameras_name==hr_cam: |
|||
hrn_test_key[..., :2]=normalize_screen_coordinates(hrn_test_key[..., :2], w=cam['res_w'], h=cam['res_h']) |
|||
keypoints[subject][action][cam_idx] = hrn_test_key |
|||
# print(subject,action,cam_idx) |
|||
# exit() |
|||
|
|||
return keypoints |
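# normalize_screen_coordinates (imported from common.camera) maps pixel
# coordinates into a resolution-independent range. A minimal sketch of the
# usual VideoPose3D-style definition, shown here as an assumption:
#
# def normalize_screen_coordinates_sketch(X, w, h):
#     assert X.shape[-1] == 2
#     # x is mapped to [-1, 1]; y is scaled by the same factor so the
#     # aspect ratio is preserved
#     return X / w * 2 - np.array([1, h / w])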
|||
|
|||
def fetch(self, dataset, subjects, subset=1, parse_3d_poses=True): |
|||
out_poses_3d = {} |
|||
out_poses_2d = {} |
|||
out_camera_params = {} |
|||
# print(dataset['S9']['Directions']['positions_3d'][0][199,:,:]) |
|||
# exit() |
|||
for subject in subjects: |
|||
for action in self.keypoints[subject].keys(): |
|||
if self.action_filter is not None: |
|||
found = False |
|||
for a in self.action_filter: |
|||
if action.startswith(a): |
|||
found = True |
|||
break |
|||
if not found: |
|||
continue |
|||
|
|||
poses_2d = self.keypoints[subject][action] |
|||
#print(action) |
|||
|
|||
for i in range(len(poses_2d)): |
|||
out_poses_2d[(subject, action, i)] = poses_2d[i] |
|||
|
|||
if subject in dataset.cameras(): |
|||
cams = dataset.cameras()[subject] |
|||
assert len(cams) == len(poses_2d), 'Camera count mismatch' |
|||
for i, cam in enumerate(cams): |
|||
if 'intrinsic' in cam: |
|||
out_camera_params[(subject, action, i)] = cam['intrinsic'] |
|||
|
|||
if parse_3d_poses and 'positions_3d' in dataset[subject][action]: |
|||
poses_3d = dataset[subject][action]['positions_3d'] |
|||
assert len(poses_3d) == len(poses_2d), 'Camera count mismatch' |
|||
for i in range(len(poses_3d)): |
|||
out_poses_3d[(subject, action, i)] = poses_3d[i] |
|||
|
|||
if len(out_camera_params) == 0: |
|||
out_camera_params = None |
|||
if len(out_poses_3d) == 0: |
|||
out_poses_3d = None |
|||
|
|||
stride = self.downsample |
|||
if subset < 1: |
|||
for key in out_poses_2d.keys(): |
|||
n_frames = int(round(len(out_poses_2d[key]) // stride * subset) * stride) |
|||
start = deterministic_random(0, len(out_poses_2d[key]) - n_frames + 1, str(len(out_poses_2d[key]))) |
|||
out_poses_2d[key] = out_poses_2d[key][start:start + n_frames:stride] |
|||
if out_poses_3d is not None: |
|||
out_poses_3d[key] = out_poses_3d[key][start:start + n_frames:stride] |
|||
elif stride > 1: |
|||
for key in out_poses_2d.keys(): |
|||
out_poses_2d[key] = out_poses_2d[key][::stride] |
|||
if out_poses_3d is not None: |
|||
out_poses_3d[key] = out_poses_3d[key][::stride] |
|||
|
|||
return out_camera_params, out_poses_3d, out_poses_2d |
|||
|
|||
def __len__(self): |
|||
return len(self.generator.pairs) |
|||
#return 200 |
|||
|
|||
def __getitem__(self, index): |
|||
seq_name, start_3d, end_3d, flip, reverse = self.generator.pairs[index] |
|||
|
|||
if self.MAE: |
|||
cam, input_2D, action, subject, cam_ind = self.generator.get_batch(seq_name, start_3d, end_3d, flip, reverse)
|||
if not self.train and self.test_aug:
|||
_, input_2D_aug, _, _,_ = self.generator.get_batch(seq_name, start_3d, end_3d, flip=True, reverse=reverse) |
|||
input_2D = np.concatenate((np.expand_dims(input_2D,axis=0),np.expand_dims(input_2D_aug,axis=0)),0) |
|||
else: |
|||
cam, gt_3D, input_2D, action, subject, cam_ind = self.generator.get_batch(seq_name, start_3d, end_3d, flip, reverse) |
|||
|
|||
if not self.train and self.test_aug:
|||
_, _, input_2D_aug, _, _,_ = self.generator.get_batch(seq_name, start_3d, end_3d, flip=True, reverse=reverse) |
|||
input_2D = np.concatenate((np.expand_dims(input_2D,axis=0),np.expand_dims(input_2D_aug,axis=0)),0) |
|||
|
|||
bb_box = np.array([0, 0, 1, 1]) |
|||
input_2D_update = input_2D |
|||
|
|||
scale = np.float64(1.0)
|||
|
|||
if self.MAE: |
|||
return cam, input_2D_update, action, subject, scale, bb_box, cam_ind |
|||
else: |
|||
return cam, gt_3D, input_2D_update, action, subject, scale, bb_box, cam_ind |
|||
|
|||
|
|||
|
@ -0,0 +1,35 @@ |
|||
|
|||
|
|||
class MocapDataset: |
|||
def __init__(self, fps, skeleton): |
|||
self._skeleton = skeleton |
|||
self._fps = fps |
|||
self._data = None |
|||
self._cameras = None |
|||
|
|||
def remove_joints(self, joints_to_remove): |
|||
kept_joints = self._skeleton.remove_joints(joints_to_remove) |
|||
for subject in self._data.keys(): |
|||
for action in self._data[subject].keys(): |
|||
s = self._data[subject][action] |
|||
s['positions'] = s['positions'][:, kept_joints] |
|||
|
|||
def __getitem__(self, key): |
|||
return self._data[key] |
|||
|
|||
def subjects(self): |
|||
return self._data.keys() |
|||
|
|||
def fps(self): |
|||
return self._fps |
|||
|
|||
def skeleton(self): |
|||
return self._skeleton |
|||
|
|||
def cameras(self): |
|||
return self._cameras |
|||
|
|||
def supports_semi_supervised(self): |
|||
return False |
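# A minimal concrete subclass sketch (illustrative only; the real
# Human3.6M dataset class lives elsewhere in this repo and fills _data and
# _cameras from the preprocessed .npz files):
class _ToyMocapDataset(MocapDataset):
    def __init__(self, skeleton):
        super().__init__(fps=50, skeleton=skeleton)
        self._data = {}     # subject -> action -> {'positions': ..., 'cameras': [...]}
        self._cameras = {}  # subject -> list of camera dicts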
|||
|
|||
|
@ -0,0 +1,137 @@ |
|||
import argparse |
|||
import os |
|||
import math |
|||
import time |
|||
import torch |
|||
import torch.nn as nn |
|||
|
|||
class opts(): |
|||
def __init__(self): |
|||
self.parser = argparse.ArgumentParser() |
|||
|
|||
def init(self): |
|||
self.parser.add_argument('--layers', default=6, type=int) |
|||
self.parser.add_argument('--channel', default=256, type=int) |
|||
self.parser.add_argument('--d_hid', default=192, type=int)  # embedding dimension
|||
self.parser.add_argument('--dataset', type=str, default='h36m') |
|||
self.parser.add_argument('-k', '--keypoints', default='cpn_ft_h36m_dbb', type=str) |
|||
self.parser.add_argument('--data_augmentation', type=bool, default=True) |
|||
self.parser.add_argument('--reverse_augmentation', type=bool, default=False) |
|||
self.parser.add_argument('--test_augmentation', type=bool, default=True) |
|||
self.parser.add_argument('--crop_uv', type=int, default=0) |
|||
self.parser.add_argument('--root_path', type=str, default='./dataset/') |
|||
self.parser.add_argument('-a', '--actions', default='*', type=str) |
|||
self.parser.add_argument('--downsample', default=1, type=int) |
|||
self.parser.add_argument('--subset', default=1, type=float) |
|||
self.parser.add_argument('-s', '--stride', default=1, type=int) |
|||
self.parser.add_argument('--gpu', default='1', type=str, help='') |
|||
self.parser.add_argument('--train', type=int, default=0) |
|||
self.parser.add_argument('--test', type=int, default=1) |
|||
self.parser.add_argument('--nepoch', type=int, default=80) |
|||
self.parser.add_argument('-b','--batchSize', type=int, default=1024) |
|||
self.parser.add_argument('--lr', type=float, default=1e-3) |
|||
self.parser.add_argument('--lr_refine', type=float, default=1e-5) |
|||
self.parser.add_argument('--lr_decay_large', type=float, default=0.5) |
|||
self.parser.add_argument('--large_decay_epoch', type=int, default=80) |
|||
self.parser.add_argument('--workers', type=int, default=8) |
|||
self.parser.add_argument('-lrd', '--lr_decay', default=0.96, type=float) |
|||
self.parser.add_argument('-f','--frames', type=int, default=243) |
|||
self.parser.add_argument('--pad', type=int, default=121) |
|||
self.parser.add_argument('--refine', action='store_true') |
|||
self.parser.add_argument('--reload', type=int, default=0)  # whether to reload a pretrained model
|||
self.parser.add_argument('--refine_reload', type=int, default=0) |
|||
self.parser.add_argument('-c','--checkpoint', type=str, default='model') |
|||
self.parser.add_argument('--previous_dir', type=str, default='') |
|||
self.parser.add_argument('--n_joints', type=int, default=17) |
|||
self.parser.add_argument('--out_joints', type=int, default=17) |
|||
self.parser.add_argument('--out_all', type=int, default=1) |
|||
self.parser.add_argument('--in_channels', type=int, default=2) |
|||
self.parser.add_argument('--out_channels', type=int, default=3) |
|||
self.parser.add_argument('-previous_best_threshold', type=float, default= math.inf) |
|||
self.parser.add_argument('-previous_name', type=str, default='') |
|||
self.parser.add_argument('--previous_refine_name', type=str, default='') |
|||
self.parser.add_argument('--manualSeed', type=int, default=1) |
|||
|
|||
self.parser.add_argument('--MAE', action='store_true') |
|||
self.parser.add_argument('-tmr','--temporal_mask_rate', type=float, default=0) |
|||
self.parser.add_argument('-smn', '--spatial_mask_num', type=int, default=0) |
|||
self.parser.add_argument('-tds', '--t_downsample', type=int, default=3) |
|||
|
|||
self.parser.add_argument('--MAE_reload', type=int, default=0) |
|||
self.parser.add_argument('-r', '--resume', action='store_true') |
|||
|
|||
self.parser.add_argument('-mt', '--model_type', type=str, default='') |
|||
self.parser.add_argument('--amp', type=int, default=0) |
|||
|
|||
|
|||
|
|||
|
|||
|
|||
def parse(self): |
|||
self.init() |
|||
self.opt = self.parser.parse_args() |
|||
|
|||
self.opt.pad = (self.opt.frames-1) // 2 |
|||
|
|||
self.opt.part_list = [ |
|||
[8, 9, 10],   # head
|||
[0, 7, 8],    # torso
|||
[11, 12, 13], # left arm
|||
[14, 15, 16], # right arm
|||
[4, 5, 6],    # left leg
|||
[1, 2, 3]     # right leg
|||
] |
|||
|
|||
stride_num = { |
|||
'9': [1, 3, 3], |
|||
'27': [3, 3, 3], |
|||
'351': [3, 9, 13], |
|||
'81': [3, 3, 3, 3], |
|||
'243': [3, 3, 3, 3, 3], |
|||
} |
|||
|
|||
if str(self.opt.frames) in stride_num: |
|||
self.opt.stride_num = stride_num[str(self.opt.frames)] |
|||
else: |
|||
self.opt.stride_num = None |
|||
print('no stride_num') |
|||
exit() |
|||
|
|||
self.opt.subjects_train = 'S1,S5,S6,S7,S8' |
|||
self.opt.subjects_test = 'S9,S11' |
|||
#self.opt.subjects_test = 'S11' |
|||
|
|||
#if self.opt.train: |
|||
logtime = time.strftime('%m%d_%H%M_%S_') |
|||
|
|||
ckp_suffix = '' |
|||
if self.opt.refine: |
|||
ckp_suffix='_refine' |
|||
elif self.opt.MAE: |
|||
ckp_suffix = '_pretrain' |
|||
else: |
|||
ckp_suffix = '_STCFormer' |
|||
self.opt.checkpoint = 'checkpoint/'+self.opt.checkpoint + '_%d'%(self.opt.pad*2+1) + \ |
|||
'_%s'%self.opt.model_type |
|||
|
|||
if not os.path.exists(self.opt.checkpoint): |
|||
os.makedirs(self.opt.checkpoint) |
|||
|
|||
if self.opt.train: |
|||
args = dict((name, getattr(self.opt, name)) for name in dir(self.opt) |
|||
if not name.startswith('_')) |
|||
|
|||
file_name = os.path.join(self.opt.checkpoint, 'opt.txt') |
|||
with open(file_name, 'wt') as opt_file: |
|||
opt_file.write('==> Args:\n') |
|||
for k, v in sorted(args.items()): |
|||
opt_file.write(' %s: %s\n' % (str(k), str(v))) |
|||
opt_file.write('==> Args:\n') |
|||
|
|||
return self.opt |
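# Minimal driver sketch (run_stc.py is the real entry point; the print is
# illustrative only):
if __name__ == '__main__':
    opt = opts().parse()
    # with the defaults above: frames=243 -> pad=(243-1)//2=121 and
    # stride_num=[3, 3, 3, 3, 3]
    print(opt.frames, opt.pad, opt.stride_num)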
|||
|
|||
|
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,83 @@ |
|||
|
|||
import numpy as np |
|||
|
|||
|
|||
class Skeleton: |
|||
def __init__(self, parents, joints_left, joints_right): |
|||
assert len(joints_left) == len(joints_right) |
|||
|
|||
self._parents = np.array(parents) |
|||
self._joints_left = joints_left |
|||
self._joints_right = joints_right |
|||
self._compute_metadata() |
|||
|
|||
def num_joints(self): |
|||
return len(self._parents) |
|||
|
|||
def parents(self): |
|||
return self._parents |
|||
|
|||
def has_children(self): |
|||
return self._has_children |
|||
|
|||
def children(self): |
|||
return self._children |
|||
|
|||
def remove_joints(self, joints_to_remove): |
|||
|
|||
valid_joints = [] |
|||
for joint in range(len(self._parents)): |
|||
if joint not in joints_to_remove: |
|||
valid_joints.append(joint) |
|||
|
|||
for i in range(len(self._parents)): |
|||
while self._parents[i] in joints_to_remove: |
|||
self._parents[i] = self._parents[self._parents[i]] |
|||
|
|||
index_offsets = np.zeros(len(self._parents), dtype=int) |
|||
new_parents = [] |
|||
for i, parent in enumerate(self._parents): |
|||
if i not in joints_to_remove: |
|||
new_parents.append(parent - index_offsets[parent]) |
|||
else: |
|||
index_offsets[i:] += 1 |
|||
self._parents = np.array(new_parents) |
|||
|
|||
if self._joints_left is not None: |
|||
new_joints_left = [] |
|||
for joint in self._joints_left: |
|||
if joint in valid_joints: |
|||
new_joints_left.append(joint - index_offsets[joint]) |
|||
self._joints_left = new_joints_left |
|||
if self._joints_right is not None: |
|||
new_joints_right = [] |
|||
for joint in self._joints_right: |
|||
if joint in valid_joints: |
|||
new_joints_right.append(joint - index_offsets[joint]) |
|||
self._joints_right = new_joints_right |
|||
|
|||
self._compute_metadata() |
|||
|
|||
return valid_joints |
|||
|
|||
def joints_left(self): |
|||
return self._joints_left |
|||
|
|||
def joints_right(self): |
|||
return self._joints_right |
|||
|
|||
def _compute_metadata(self): |
|||
self._has_children = np.zeros(len(self._parents)).astype(bool) |
|||
for i, parent in enumerate(self._parents): |
|||
if parent != -1: |
|||
self._has_children[parent] = True |
|||
|
|||
self._children = [] |
|||
for i, parent in enumerate(self._parents): |
|||
self._children.append([]) |
|||
for i, parent in enumerate(self._parents): |
|||
if parent != -1: |
|||
self._children[parent].append(i) |
|||
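# Usage sketch: the parents list below is the common 17-joint Human3.6M
# layout used by VideoPose3D-style code (an assumption; the repo's own
# dataset files define the authoritative skeleton).
if __name__ == '__main__':
    h36m_17 = Skeleton(
        parents=[-1, 0, 1, 2, 0, 4, 5, 0, 7, 8, 9, 8, 11, 12, 8, 14, 15],
        joints_left=[4, 5, 6, 11, 12, 13],
        joints_right=[1, 2, 3, 14, 15, 16])
    assert h36m_17.num_joints() == 17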
|
|||
|
|||
|
@ -0,0 +1,235 @@ |
|||
import torch |
|||
import numpy as np |
|||
import hashlib |
|||
from torch.autograd import Variable |
|||
import os |
|||
import torch.nn as nn |
|||
|
|||
def deterministic_random(min_value, max_value, data): |
|||
digest = hashlib.sha256(data.encode()).digest() |
|||
raw_value = int.from_bytes(digest[:4], byteorder='little', signed=False) |
|||
return int(raw_value / (2 ** 32 - 1) * (max_value - min_value)) + min_value |
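# Example (illustrative): deterministic_random(0, 100, 'S1 Walking') hashes
# the string with SHA-256 and maps the first four bytes into [0, 100), so
# the same key always yields the same "random" window start.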
|||
|
|||
|
|||
def mpjpe_cal(predicted, target): |
|||
assert predicted.shape == target.shape |
|||
return torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1)) |
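# MPJPE (protocol #1): the mean over frames and joints of the Euclidean
# distance ||predicted_j - target_j||_2, computed without any alignment.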
|||
|
|||
|
|||
|
|||
def mpjpe_cal_l1(predicted, target): |
|||
assert predicted.shape == target.shape |
|||
criterion = nn.L1Loss(reduction='mean') |
|||
# return torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1)) |
|||
return criterion(predicted,target) |
|||
|
|||
def test_calculation(predicted, target, action, error_sum, data_type, subject, MAE=False): |
|||
error_sum = mpjpe_by_action_p1(predicted, target, action, error_sum) |
|||
if not MAE: |
|||
error_sum = mpjpe_by_action_p2(predicted, target, action, error_sum) |
|||
|
|||
return error_sum |
|||
|
|||
|
|||
def mpjpe_by_action_p1(predicted, target, action, action_error_sum): |
|||
assert predicted.shape == target.shape |
|||
batch_num = predicted.size(0) |
|||
frame_num = predicted.size(1) |
|||
dist = torch.mean(torch.norm(predicted - target, dim=len(target.shape) - 1), dim=len(target.shape) - 2) |
|||
|
|||
if len(set(list(action))) == 1: |
|||
end_index = action[0].find(' ') |
|||
if end_index != -1: |
|||
action_name = action[0][:end_index] |
|||
else: |
|||
action_name = action[0] |
|||
|
|||
action_error_sum[action_name]['p1'].update(torch.mean(dist).item()*batch_num*frame_num, batch_num*frame_num) |
|||
else: |
|||
for i in range(batch_num): |
|||
end_index = action[i].find(' ') |
|||
if end_index != -1: |
|||
action_name = action[i][:end_index] |
|||
else: |
|||
action_name = action[i] |
|||
|
|||
action_error_sum[action_name]['p1'].update(torch.mean(dist[i]).item()*frame_num, frame_num) |
|||
|
|||
return action_error_sum |
|||
|
|||
|
|||
def mpjpe_by_action_p2(predicted, target, action, action_error_sum): |
|||
assert predicted.shape == target.shape |
|||
num = predicted.size(0) |
|||
pred = predicted.detach().cpu().numpy().reshape(-1, predicted.shape[-2], predicted.shape[-1]) |
|||
gt = target.detach().cpu().numpy().reshape(-1, target.shape[-2], target.shape[-1]) |
|||
dist = p_mpjpe(pred, gt) |
|||
if len(set(list(action))) == 1: |
|||
end_index = action[0].find(' ') |
|||
if end_index != -1: |
|||
action_name = action[0][:end_index] |
|||
else: |
|||
action_name = action[0] |
|||
action_error_sum[action_name]['p2'].update(np.mean(dist) * num, num) |
|||
else: |
|||
for i in range(num): |
|||
end_index = action[i].find(' ') |
|||
if end_index != -1: |
|||
action_name = action[i][:end_index] |
|||
else: |
|||
action_name = action[i] |
|||
action_error_sum[action_name]['p2'].update(np.mean(dist), 1) |
|||
|
|||
return action_error_sum |
|||
|
|||
|
|||
def p_mpjpe(predicted, target): |
|||
assert predicted.shape == target.shape |
|||
|
|||
muX = np.mean(target, axis=1, keepdims=True) |
|||
muY = np.mean(predicted, axis=1, keepdims=True) |
|||
|
|||
X0 = target - muX |
|||
Y0 = predicted - muY |
|||
|
|||
normX = np.sqrt(np.sum(X0 ** 2, axis=(1, 2), keepdims=True)) |
|||
normY = np.sqrt(np.sum(Y0 ** 2, axis=(1, 2), keepdims=True)) |
|||
|
|||
X0 /= normX |
|||
Y0 /= normY |
|||
|
|||
H = np.matmul(X0.transpose(0, 2, 1), Y0) |
|||
U, s, Vt = np.linalg.svd(H) |
|||
V = Vt.transpose(0, 2, 1) |
|||
R = np.matmul(V, U.transpose(0, 2, 1)) |
|||
|
|||
sign_detR = np.sign(np.expand_dims(np.linalg.det(R), axis=1)) |
|||
V[:, :, -1] *= sign_detR |
|||
s[:, -1] *= sign_detR.flatten() |
|||
R = np.matmul(V, U.transpose(0, 2, 1)) |
|||
|
|||
tr = np.expand_dims(np.sum(s, axis=1, keepdims=True), axis=2) |
|||
|
|||
a = tr * normX / normY |
|||
t = muX - a * np.matmul(muY, R) |
|||
|
|||
predicted_aligned = a * np.matmul(predicted, R) + t |
|||
|
|||
return np.mean(np.linalg.norm(predicted_aligned - target, axis=len(target.shape) - 1), axis=len(target.shape) - 2) |
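# P-MPJPE (protocol #2): an optimal similarity transform (scale a,
# rotation R, translation t) is solved per sample via SVD (orthogonal
# Procrustes) before measuring the joint error. Sanity-check sketch:
#
# pts = np.random.rand(8, 17, 3)
# rot, _ = np.linalg.qr(np.random.randn(3, 3))
# if np.linalg.det(rot) < 0:
#     rot[:, 0] *= -1  # keep a proper rotation (det = +1)
# assert p_mpjpe(1.7 * pts @ rot + 0.3, pts).max() < 1e-6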
|||
|
|||
|
|||
def define_actions(action):
|||
|
|||
actions = ["Directions","Discussion","Eating","Greeting", |
|||
"Phoning","Photo","Posing","Purchases", |
|||
"Sitting","SittingDown","Smoking","Waiting", |
|||
"WalkDog","Walking","WalkTogether"] |
|||
|
|||
if action == "All" or action == "all" or action == '*': |
|||
return actions |
|||
|
|||
if action not in actions:
|||
raise ValueError("Unrecognized action: %s" % action)
|||
|
|||
return [action] |
|||
|
|||
|
|||
def define_error_list(actions): |
|||
error_sum = {} |
|||
error_sum.update({actions[i]: {'p1':AccumLoss(), 'p2':AccumLoss()} for i in range(len(actions))}) |
|||
return error_sum |
|||
|
|||
|
|||
class AccumLoss(object): |
|||
def __init__(self): |
|||
self.val = 0 |
|||
self.avg = 0 |
|||
self.sum = 0 |
|||
self.count = 0 |
|||
|
|||
def update(self, val, n=1): |
|||
self.val = val |
|||
self.sum += val |
|||
self.count += n |
|||
self.avg = self.sum / self.count |
|||
|
|||
|
|||
def get_varialbe(split, target):  # sic: misspelling of get_variable, kept so existing callers keep working
|||
num = len(target) |
|||
var = [] |
|||
if split == 'train': |
|||
for i in range(num): |
|||
temp = Variable(target[i], requires_grad=False).contiguous().type(torch.cuda.FloatTensor) |
|||
var.append(temp) |
|||
else: |
|||
for i in range(num): |
|||
temp = Variable(target[i]).contiguous().cuda().type(torch.cuda.FloatTensor) |
|||
var.append(temp) |
|||
|
|||
return var |
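# Note: torch.autograd.Variable has been a no-op wrapper since PyTorch 0.4;
# target[i].contiguous().float().cuda() is the modern equivalent of either
# branch above.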
|||
|
|||
|
|||
def print_error(data_type, action_error_sum, is_train): |
|||
mean_error_p1, mean_error_p2 = print_error_action(action_error_sum, is_train) |
|||
|
|||
return mean_error_p1, mean_error_p2 |
|||
|
|||
|
|||
def print_error_action(action_error_sum, is_train): |
|||
mean_error_each = {'p1': 0.0, 'p2': 0.0} |
|||
mean_error_all = {'p1': AccumLoss(), 'p2': AccumLoss()} |
|||
|
|||
if is_train == 0: |
|||
print("{0:=^12} {1:=^10} {2:=^8}".format("Action", "p#1 mm", "p#2 mm")) |
|||
|
|||
for action, value in action_error_sum.items(): |
|||
if is_train == 0: |
|||
print("{0:<12} ".format(action), end="") |
|||
|
|||
mean_error_each['p1'] = action_error_sum[action]['p1'].avg * 1000.0 |
|||
mean_error_all['p1'].update(mean_error_each['p1'], 1) |
|||
|
|||
mean_error_each['p2'] = action_error_sum[action]['p2'].avg * 1000.0 |
|||
mean_error_all['p2'].update(mean_error_each['p2'], 1) |
|||
|
|||
if is_train == 0: |
|||
print("{0:>6.2f} {1:>10.2f}".format(mean_error_each['p1'], mean_error_each['p2'])) |
|||
|
|||
if is_train == 0: |
|||
print("{0:<12} {1:>6.2f} {2:>10.2f}".format("Average", mean_error_all['p1'].avg, \ |
|||
mean_error_all['p2'].avg)) |
|||
|
|||
return mean_error_all['p1'].avg, mean_error_all['p2'].avg |
|||
|
|||
|
|||
def save_model(previous_name, save_dir,epoch, data_threshold, model, model_name): |
|||
# if os.path.exists(previous_name): |
|||
# os.remove(previous_name) |
|||
|
|||
torch.save(model.state_dict(), |
|||
'%s/%s_%d_%d.pth' % (save_dir, model_name, epoch, data_threshold * 100)) |
|||
|
|||
previous_name = '%s/%s_%d_%d.pth' % (save_dir, model_name, epoch, data_threshold * 100) |
|||
return previous_name |
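# Loading counterpart (a sketch; the path pattern mirrors save_model above):
#
# ckpt_path = '%s/%s_%d_%d.pth' % (save_dir, model_name, epoch, data_threshold * 100)
# model.load_state_dict(torch.load(ckpt_path, map_location='cpu'))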
|||
|
|||
def save_model_new(save_dir,epoch, data_threshold, lr, optimizer, model, model_name): |
|||
# if os.path.exists(previous_name): |
|||
# os.remove(previous_name) |
|||
|
|||
# torch.save(model.state_dict(), |
|||
# '%s/%s_%d_%d.pth' % (save_dir, model_name, epoch, data_threshold * 100)) |
|||
torch.save({ |
|||
'epoch': epoch, |
|||
'lr': lr, |
|||
'optimizer': optimizer.state_dict(), |
|||
'model_pos': model.state_dict(), |
|||
}, |
|||
'%s/%s_%d_%d.pth' % (save_dir, model_name, epoch, data_threshold * 100)) |
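# Resuming from a save_model_new checkpoint (a sketch):
#
# ckpt = torch.load(ckpt_path, map_location='cpu')
# model.load_state_dict(ckpt['model_pos'])
# optimizer.load_state_dict(ckpt['optimizer'])
# start_epoch, lr = ckpt['epoch'] + 1, ckpt['lr']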
|||
|
|||
|
|||
|
|||
|
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,24 @@ |
|||
|
|||
|
|||
def mpii_get_sequence_info(subject_id, sequence): |
|||
|
|||
switcher = { |
|||
"1 1": [6416,25], |
|||
"1 2": [12430,50], |
|||
"2 1": [6502,25], |
|||
"2 2": [6081,25], |
|||
"3 1": [12488,50], |
|||
"3 2": [12283,50], |
|||
"4 1": [6171,25], |
|||
"4 2": [6675,25], |
|||
"5 1": [12820,50], |
|||
"5 2": [12312,50], |
|||
"6 1": [6188,25], |
|||
"6 2": [6145,25], |
|||
"7 1": [6239,25], |
|||
"7 2": [6320,25], |
|||
"8 1": [6468,25], |
|||
"8 2": [6054,25], |
|||
|
|||
} |
|||
return switcher.get(subject_id+" "+sequence) |
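# Each value is [frame_count, fps] for an MPI-INF-3DHP (subject, sequence)
# pair, e.g. mpii_get_sequence_info('1', '2') -> [12430, 50]; unknown keys
# return None.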
@ -0,0 +1,689 @@ |
|||
# Copyright (c) 2018-present, Facebook, Inc. |
|||
# All rights reserved. |
|||
# |
|||
# This source code is licensed under the license found in the |
|||
# LICENSE file in the root directory of this source tree. |
|||
# |
|||
|
|||
import matplotlib |
|||
|
|||
matplotlib.use('Agg') |
|||
|
|||
import matplotlib.pyplot as plt |
|||
from matplotlib.animation import FuncAnimation, writers |
|||
from mpl_toolkits.mplot3d import Axes3D |
|||
import numpy as np |
|||
import subprocess as sp |
|||
|
|||
|
|||
def get_resolution(filename): |
|||
command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', |
|||
'-show_entries', 'stream=width,height', '-of', 'csv=p=0', filename] |
|||
with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: |
|||
for line in pipe.stdout: |
|||
w, h = line.decode().strip().split(',') |
|||
return int(w), int(h) |
|||
|
|||
|
|||
def get_fps(filename): |
|||
command = ['ffprobe', '-v', 'error', '-select_streams', 'v:0', |
|||
'-show_entries', 'stream=r_frame_rate', '-of', 'csv=p=0', filename] |
|||
with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: |
|||
for line in pipe.stdout: |
|||
a, b = line.decode().strip().split('/') |
|||
return int(a) / int(b) |
|||
|
|||
|
|||
def read_video(filename, skip=0, limit=-1): |
|||
w, h = get_resolution(filename) |
|||
|
|||
command = ['ffmpeg', |
|||
'-i', filename, |
|||
'-f', 'image2pipe', |
|||
'-pix_fmt', 'rgb24', |
|||
'-vsync', '0', |
|||
'-vcodec', 'rawvideo', '-'] |
|||
|
|||
i = 0 |
|||
with sp.Popen(command, stdout=sp.PIPE, bufsize=-1) as pipe: |
|||
while True: |
|||
data = pipe.stdout.read(w * h * 3) |
|||
if not data: |
|||
break |
|||
i += 1 |
|||
if i > limit and limit != -1: |
|||
continue |
|||
if i > skip: |
|||
yield np.frombuffer(data, dtype='uint8').reshape((h, w, 3)) |
|||
|
|||
|
|||
def downsample_tensor(X, factor): |
|||
length = X.shape[0] // factor * factor |
|||
return np.mean(X[:length].reshape(-1, factor, *X.shape[1:]), axis=1) |
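# Example: downsample_tensor(X, 4) with X of shape (103, 17, 2) averages
# non-overlapping windows of 4 frames and returns shape (25, 17, 2); the
# trailing 3 frames that do not fill a window are dropped.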
|||
|
|||
|
|||
def render_animation(keypoints, keypoints_metadata, poses, skeleton, fps, bitrate, azim, output, viewport, |
|||
limit=-1, downsample=1, size=6, input_video_path=None, input_video_skip=0, viz_action="", |
|||
viz_subject=""): |
|||
""" |
|||
TODO |
|||
Render an animation. The supported output modes are: |
|||
-- 'interactive': display an interactive figure |
|||
(also works on notebooks if associated with %matplotlib inline) |
|||
-- 'html': render the animation as HTML5 video. Can be displayed in a notebook using HTML(...). |
|||
-- 'filename.mp4': render and export the animation as an h264 video (requires ffmpeg). |
|||
-- 'filename.gif': render and export the animation as a gif file (requires imagemagick).
|||
""" |
|||
|
|||
plt.ioff() |
|||
fig = plt.figure(figsize=(size * (1 + len(poses)), size)) |
|||
ax_in = fig.add_subplot(1, 1 + len(poses), 1) |
|||
ax_in.get_xaxis().set_visible(False) |
|||
ax_in.get_yaxis().set_visible(False) |
|||
ax_in.set_axis_off() |
|||
ax_in.set_title('Input') |
|||
|
|||
ax_3d = [] |
|||
lines_3d = [] |
|||
trajectories = [] |
|||
radius = 1.7 |
|||
for index, (title, data) in enumerate(poses.items()): |
|||
ax = fig.add_subplot(1, 1 + len(poses), index + 2, projection='3d') |
|||
ax.view_init(elev=15., azim=azim+90.) |
|||
ax.set_xlim3d([-radius / 2, radius / 2]) |
|||
ax.set_zlim3d([0, radius]) |
|||
ax.set_ylim3d([-radius / 2, radius / 2]) |
|||
# ax.set_aspect('equal') |
|||
ax.set_xticklabels([]) |
|||
ax.set_yticklabels([]) |
|||
ax.set_zticklabels([]) |
|||
ax.dist = 7.5 |
|||
ax.set_title(title) # , pad=35 |
|||
ax_3d.append(ax) |
|||
lines_3d.append([]) |
|||
trajectories.append(data[:, 0, [0, 1]]) |
|||
poses = list(poses.values()) |
|||
|
|||
# Decode video |
|||
if input_video_path is None: |
|||
# Black background |
|||
all_frames = np.zeros((keypoints.shape[0], viewport[1], viewport[0]), dtype='uint8') |
|||
else: |
|||
# Load video using ffmpeg |
|||
all_frames = [] |
|||
for f in read_video(input_video_path, skip=input_video_skip, limit=limit): |
|||
all_frames.append(f) |
|||
effective_length = min(keypoints.shape[0], len(all_frames)) |
|||
all_frames = all_frames[:effective_length] |
|||
|
|||
keypoints = keypoints[input_video_skip:] # todo remove |
|||
for idx in range(len(poses)): |
|||
poses[idx] = poses[idx][input_video_skip:] |
|||
|
|||
if fps is None: |
|||
fps = get_fps(input_video_path) |
|||
|
|||
if downsample > 1: |
|||
keypoints = downsample_tensor(keypoints, downsample) |
|||
all_frames = downsample_tensor(np.array(all_frames), downsample).astype('uint8') |
|||
for idx in range(len(poses)): |
|||
poses[idx] = downsample_tensor(poses[idx], downsample) |
|||
trajectories[idx] = downsample_tensor(trajectories[idx], downsample) |
|||
fps /= downsample |
|||
|
|||
initialized = False |
|||
image = None |
|||
lines = [] |
|||
points = None |
|||
|
|||
if limit < 1: |
|||
limit = len(all_frames) |
|||
else: |
|||
limit = min(limit, len(all_frames)) |
|||
|
|||
parents = skeleton.parents() |
|||
|
|||
def update_video(i): |
|||
nonlocal initialized, image, lines, points |
|||
|
|||
for n, ax in enumerate(ax_3d): |
|||
ax.set_xlim3d([-radius / 2 + trajectories[n][i, 0], radius / 2 + trajectories[n][i, 0]]) |
|||
ax.set_ylim3d([-radius / 2 + trajectories[n][i, 1], radius / 2 + trajectories[n][i, 1]]) |
|||
|
|||
# Update 2D poses |
|||
# joints_right_2d = keypoints_metadata['keypoints_symmetry'][1] |
|||
# joints_left_2d = keypoints_metadata['keypoints_symmetry'][0] |
|||
joints_left_2d = [4, 5, 6, 11, 12, 13] |
|||
joints_right_2d = [1, 2, 3, 14, 15, 16] |
|||
colors_2d = np.full(keypoints.shape[1], 'midnightblue', dtype="object") |
|||
colors_2d[joints_right_2d] = 'yellowgreen' |
|||
colors_2d[joints_left_2d] = 'midnightblue' |
|||
if not initialized: |
|||
image = ax_in.imshow(all_frames[i], aspect='equal') |
|||
|
|||
for j, j_parent in enumerate(parents): |
|||
if j_parent == -1: |
|||
continue |
|||
|
|||
# if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco': |
|||
if len(parents) == keypoints.shape[1]: |
|||
# Draw skeleton only if keypoints match (otherwise we don't have the parents definition) |
|||
lines.append(ax_in.plot([keypoints[i, j, 0], keypoints[i, j_parent, 0]], |
|||
[keypoints[i, j, 1], keypoints[i, j_parent, 1]], color=colors_2d[j])) |
|||
|
|||
col = 'red' if j in skeleton.joints_right() else 'black' |
|||
for n, ax in enumerate(ax_3d): |
|||
pos = poses[n][i] |
|||
lines_3d[n].append(ax.plot([pos[j, 0], pos[j_parent, 0]], |
|||
[pos[j, 1], pos[j_parent, 1]], |
|||
[pos[j, 2], pos[j_parent, 2]], zdir='z', c=colors_2d[j])) |
|||
|
|||
# points = ax_in.scatter(*keypoints[i].T, 0, zorder=10) |
|||
|
|||
initialized = True |
|||
else: |
|||
image.set_data(all_frames[i]) |
|||
|
|||
for j, j_parent in enumerate(parents): |
|||
if j_parent == -1: |
|||
continue |
|||
|
|||
# if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco': |
|||
if len(parents) == keypoints.shape[1]: |
|||
lines[j - 1][0].set_data([keypoints[i, j, 0], keypoints[i, j_parent, 0]], |
|||
[keypoints[i, j, 1], keypoints[i, j_parent, 1]]) |
|||
|
|||
for n, ax in enumerate(ax_3d): |
|||
pos = poses[n][i] |
|||
lines_3d[n][j - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]]) |
|||
lines_3d[n][j - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]]) |
|||
lines_3d[n][j - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z') |
|||
|
|||
# points.set_offsets(keypoints[i]) |
|||
|
|||
print('{}/{} '.format(i, limit), end='\r') |
|||
|
|||
fig.tight_layout() |
|||
|
|||
anim = FuncAnimation(fig, update_video, frames=np.arange(0, limit), interval=1000 / fps, repeat=False) |
|||
if output.endswith('.mp4'): |
|||
Writer = writers['ffmpeg'] |
|||
writer = Writer(fps=fps, metadata={}, bitrate=bitrate) |
|||
anim.save(output, writer=writer) |
|||
elif output.endswith('.gif'): |
|||
anim.save(output, dpi=80, writer='imagemagick') |
|||
else: |
|||
raise ValueError('Unsupported output format (only .mp4 and .gif are supported)') |
|||
plt.close() |
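# Call sketch (hedged; argument values are illustrative): `poses` maps a
# panel title to a (T, 17, 3) array of camera-space positions.
#
# render_animation(kpts_2d, metadata, {'Reconstruction': pred_3d}, skeleton,
#                  fps=50, bitrate=3000, azim=70, output='out.mp4',
#                  viewport=(1000, 1002))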
|||
|
|||
|
|||
def render_animation_temp(keypoints, keypoints_metadata, poses, skeleton, fps, bitrate, azim, output, viewport, |
|||
limit=-1, downsample=1, size=6, input_video_path=None, input_video_skip=0, viz_action="", |
|||
viz_subject=""): |
|||
""" |
|||
TODO |
|||
Render an animation. The supported output modes are: |
|||
-- 'interactive': display an interactive figure |
|||
(also works on notebooks if associated with %matplotlib inline) |
|||
-- 'html': render the animation as HTML5 video. Can be displayed in a notebook using HTML(...). |
|||
-- 'filename.mp4': render and export the animation as an h264 video (requires ffmpeg). |
|||
-- 'filename.gif': render and export the animation as a gif file (requires imagemagick).
|||
""" |
|||
|
|||
output = output + "_" + viz_subject + "_" + viz_action + ".mp4" |
|||
print(output) |
|||
|
|||
plt.ioff() |
|||
fig = plt.figure(figsize=(size * (1 + len(poses)), size)) |
|||
ax_in = fig.add_subplot(1, 1 + len(poses), 1) |
|||
ax_in.get_xaxis().set_visible(False) |
|||
ax_in.get_yaxis().set_visible(False) |
|||
ax_in.set_axis_off() |
|||
ax_in.set_title('Input') |
|||
|
|||
ax_3d = [] |
|||
lines_3d = [] |
|||
trajectories = [] |
|||
radius = 1.7 |
|||
for index, (title, data) in enumerate(poses.items()): |
|||
ax = fig.add_subplot(1, 1 + len(poses), index + 2, projection='3d') |
|||
ax.view_init(elev=15., azim=azim) |
|||
ax.set_xlim3d([-radius / 2, radius / 2]) |
|||
ax.set_zlim3d([0, radius]) |
|||
ax.set_ylim3d([-radius / 2, radius / 2]) |
|||
# ax.set_aspect('equal') |
|||
ax.set_xticklabels([]) |
|||
ax.set_yticklabels([]) |
|||
ax.set_zticklabels([]) |
|||
ax.dist = 7.5 |
|||
ax.set_title(title) # , pad=35 |
|||
ax_3d.append(ax) |
|||
lines_3d.append([]) |
|||
trajectories.append(data[:, 0, [0, 1]]) |
|||
poses = list(poses.values()) |
|||
|
|||
# Decode video |
|||
if input_video_path is None: |
|||
# Black background |
|||
all_frames = np.zeros((keypoints.shape[0], viewport[1], viewport[0]), dtype='uint8') |
|||
else: |
|||
# Load video using ffmpeg |
|||
all_frames = [] |
|||
for f in read_video(input_video_path, skip=input_video_skip, limit=limit): |
|||
all_frames.append(f) |
|||
effective_length = min(keypoints.shape[0], len(all_frames)) |
|||
all_frames = all_frames[:effective_length] |
|||
|
|||
keypoints = keypoints[input_video_skip:] # todo remove |
|||
for idx in range(len(poses)): |
|||
poses[idx] = poses[idx][input_video_skip:] |
|||
|
|||
if fps is None: |
|||
fps = get_fps(input_video_path) |
|||
|
|||
if downsample > 1: |
|||
keypoints = downsample_tensor(keypoints, downsample) |
|||
all_frames = downsample_tensor(np.array(all_frames), downsample).astype('uint8') |
|||
for idx in range(len(poses)): |
|||
poses[idx] = downsample_tensor(poses[idx], downsample) |
|||
trajectories[idx] = downsample_tensor(trajectories[idx], downsample) |
|||
fps /= downsample |
|||
|
|||
initialized = False |
|||
image = None |
|||
lines = [] |
|||
points = None |
|||
|
|||
if limit < 1: |
|||
limit = len(all_frames) |
|||
else: |
|||
limit = min(limit, len(all_frames)) |
|||
|
|||
parents = skeleton.parents() |
|||
|
|||
def update_video(i): |
|||
nonlocal initialized, image, lines, points |
|||
|
|||
for n, ax in enumerate(ax_3d): |
|||
ax.set_xlim3d([-radius / 2 + trajectories[n][i, 0], radius / 2 + trajectories[n][i, 0]]) |
|||
ax.set_ylim3d([-radius / 2 + trajectories[n][i, 1], radius / 2 + trajectories[n][i, 1]]) |
|||
|
|||
# Update 2D poses |
|||
joints_right_2d = keypoints_metadata['keypoints_symmetry'][1] |
|||
joints_left_2d = keypoints_metadata['keypoints_symmetry'][0] |
|||
colors_2d = np.full(keypoints.shape[1], 'peru', dtype="object") |
|||
colors_2d[joints_right_2d] = 'darkseagreen' |
|||
colors_2d[joints_left_2d] = 'slateblue' |
|||
if not initialized: |
|||
image = ax_in.imshow(all_frames[i], aspect='equal') |
|||
|
|||
for j, j_parent in enumerate(parents): |
|||
if j_parent == -1: |
|||
continue |
|||
|
|||
# if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco': |
|||
if len(parents) == keypoints.shape[1]: |
|||
# Draw skeleton only if keypoints match (otherwise we don't have the parents definition) |
|||
lines.append(ax_in.plot([keypoints[i, j, 0], keypoints[i, j_parent, 0]], |
|||
[keypoints[i, j, 1], keypoints[i, j_parent, 1]], color=colors_2d[j])) |
|||
|
|||
col = 'red' if j in skeleton.joints_right() else 'black' |
|||
for n, ax in enumerate(ax_3d): |
|||
pos = poses[n][i] |
|||
lines_3d[n].append(ax.plot([pos[j, 0], pos[j_parent, 0]], |
|||
[pos[j, 1], pos[j_parent, 1]], |
|||
[pos[j, 2], pos[j_parent, 2]], zdir='z', c=colors_2d[j])) |
|||
|
|||
points = ax_in.scatter(*keypoints[i].T, 10, color=colors_2d, edgecolors='white', zorder=10) |
|||
|
|||
initialized = True |
|||
else: |
|||
image.set_data(all_frames[i]) |
|||
|
|||
for j, j_parent in enumerate(parents): |
|||
if j_parent == -1: |
|||
continue |
|||
|
|||
# if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco': |
|||
if len(parents) == keypoints.shape[1]: |
|||
lines[j - 1][0].set_data([keypoints[i, j, 0], keypoints[i, j_parent, 0]], |
|||
[keypoints[i, j, 1], keypoints[i, j_parent, 1]]) |
|||
|
|||
for n, ax in enumerate(ax_3d): |
|||
pos = poses[n][i] |
|||
lines_3d[n][j - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]]) |
|||
lines_3d[n][j - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]]) |
|||
lines_3d[n][j - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z') |
|||
|
|||
points.set_offsets(keypoints[i]) |
|||
|
|||
print('{}/{} '.format(i, limit), end='\r') |
|||
|
|||
fig.tight_layout() |
|||
|
|||
anim = FuncAnimation(fig, update_video, frames=np.arange(0, limit), interval=1000 / fps, repeat=False) |
|||
if output.endswith('.mp4'): |
|||
Writer = writers['ffmpeg'] |
|||
writer = Writer(fps=fps, metadata={}, bitrate=bitrate) |
|||
anim.save(output, writer=writer) |
|||
elif output.endswith('.gif'): |
|||
anim.save(output, dpi=80, writer='imagemagick') |
|||
else: |
|||
raise ValueError('Unsupported output format (only .mp4 and .gif are supported)') |
|||
plt.close() |
|||
|
|||
|
|||
def render_animation_T(keypoints, keypoints_metadata, poses, skeleton, fps, bitrate, azim, output, |
|||
viewport, |
|||
limit=-1, downsample=1, size=6, input_video_path=None, input_video_skip=0, viz_action="", |
|||
viz_subject=""): |
|||
""" |
|||
TODO |
|||
Render an animation. The supported output modes are: |
|||
-- 'interactive': display an interactive figure |
|||
(also works on notebooks if associated with %matplotlib inline) |
|||
-- 'html': render the animation as HTML5 video. Can be displayed in a notebook using HTML(...). |
|||
-- 'filename.mp4': render and export the animation as an h264 video (requires ffmpeg). |
|||
-- 'filename.gif': render and export the animation as a gif file (requires imagemagick).
|||
""" |
|||
|
|||
output = output + "_" + viz_subject + "_" + viz_action + ".mp4" |
|||
print(output) |
|||
|
|||
plt.ioff() |
|||
fig = plt.figure(figsize=(size * (1 + len(poses)), size)) |
|||
ax_in = fig.add_subplot(1, 1 + len(poses), 1) |
|||
ax_in.get_xaxis().set_visible(False) |
|||
ax_in.get_yaxis().set_visible(False) |
|||
ax_in.set_axis_off() |
|||
ax_in.set_title('Input') |
|||
|
|||
ax_3d = [] |
|||
lines_3d = [] |
|||
trajectories = [] |
|||
radius = 1.7 |
|||
for index, (title, data) in enumerate(poses.items()): |
|||
ax = fig.add_subplot(1, 1 + len(poses), index + 2, projection='3d') |
|||
ax.view_init(elev=15., azim=azim) |
|||
ax.set_xlim3d([-radius / 2, radius / 2]) |
|||
ax.set_zlim3d([0, radius]) |
|||
ax.set_ylim3d([-radius / 2, radius / 2]) |
|||
# ax.set_aspect('equal') |
|||
ax.set_xticklabels([]) |
|||
ax.set_yticklabels([]) |
|||
ax.set_zticklabels([]) |
|||
ax.dist = 7.5 |
|||
ax.set_title(title) # , pad=35 |
|||
ax_3d.append(ax) |
|||
lines_3d.append([]) |
|||
trajectories.append(data[:, 0, [0, 1]]) |
|||
poses = list(poses.values()) |
|||
|
|||
# Decode video |
|||
if input_video_path is None: |
|||
# Black background |
|||
all_frames = np.zeros((keypoints.shape[0], viewport[1], viewport[0]), dtype='uint8') |
|||
else: |
|||
# Load video using ffmpeg |
|||
all_frames = [] |
|||
for f in read_video(input_video_path, skip=input_video_skip, limit=limit): |
|||
all_frames.append(f) |
|||
effective_length = min(keypoints.shape[0], len(all_frames)) |
|||
all_frames = all_frames[:effective_length] |
|||
|
|||
keypoints = keypoints[input_video_skip:] # todo remove |
|||
for idx in range(len(poses)): |
|||
poses[idx] = poses[idx][input_video_skip:] |
|||
|
|||
if fps is None: |
|||
fps = get_fps(input_video_path) |
|||
|
|||
if downsample > 1: |
|||
keypoints = downsample_tensor(keypoints, downsample) |
|||
all_frames = downsample_tensor(np.array(all_frames), downsample).astype('uint8') |
|||
for idx in range(len(poses)): |
|||
poses[idx] = downsample_tensor(poses[idx], downsample) |
|||
trajectories[idx] = downsample_tensor(trajectories[idx], downsample) |
|||
fps /= downsample |
|||
|
|||
initialized = False |
|||
image = None |
|||
lines = [] |
|||
points = None |
|||
|
|||
if limit < 1: |
|||
limit = len(all_frames) |
|||
else: |
|||
limit = min(limit, len(all_frames)) |
|||
|
|||
parents = skeleton.parents() |
|||
|
|||
def update_video(i): |
|||
nonlocal initialized, image, lines, points |
|||
|
|||
for n, ax in enumerate(ax_3d): |
|||
ax.set_xlim3d([-radius / 2 + trajectories[n][i, 0], radius / 2 + trajectories[n][i, 0]]) |
|||
ax.set_ylim3d([-radius / 2 + trajectories[n][i, 1], radius / 2 + trajectories[n][i, 1]]) |
|||
|
|||
# Update 2D poses |
|||
joints_right_2d = keypoints_metadata['keypoints_symmetry'][1] |
|||
joints_left_2d = keypoints_metadata['keypoints_symmetry'][0] |
|||
colors_2d = np.full(keypoints.shape[1], 'peru', dtype="object") |
|||
colors_2d[joints_right_2d] = 'darkseagreen' |
|||
colors_2d[joints_left_2d] = 'slateblue' |
|||
if not initialized: |
|||
image = ax_in.imshow(all_frames[i], aspect='equal') |
|||
|
|||
for j, j_parent in enumerate(parents): |
|||
if j_parent == -1: |
|||
continue |
|||
|
|||
# if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco': |
|||
if len(parents) == keypoints.shape[1]: |
|||
# Draw skeleton only if keypoints match (otherwise we don't have the parents definition) |
|||
lines.append(ax_in.plot([keypoints[i, j, 0], keypoints[i, j_parent, 0]], |
|||
[keypoints[i, j, 1], keypoints[i, j_parent, 1]], color=colors_2d[j])) |
|||
|
|||
col = 'red' if j in skeleton.joints_right() else 'black' |
|||
for n, ax in enumerate(ax_3d): |
|||
pos = poses[n][i] |
|||
lines_3d[n].append(ax.plot([pos[j, 0], pos[j_parent, 0]], |
|||
[pos[j, 1], pos[j_parent, 1]], |
|||
[pos[j, 2], pos[j_parent, 2]], zdir='z', c=colors_2d[j])) |
|||
|
|||
# points = ax_in.scatter(*keypoints[i].T, 0, zorder=10) |
|||
|
|||
initialized = True |
|||
else: |
|||
image.set_data(all_frames[i]) |
|||
|
|||
for j, j_parent in enumerate(parents): |
|||
if j_parent == -1: |
|||
continue |
|||
|
|||
# if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco': |
|||
if len(parents) == keypoints.shape[1]: |
|||
lines[j - 1][0].set_data([keypoints[i, j, 0], keypoints[i, j_parent, 0]], |
|||
[keypoints[i, j, 1], keypoints[i, j_parent, 1]]) |
|||
|
|||
for n, ax in enumerate(ax_3d): |
|||
pos = poses[n][i] |
|||
lines_3d[n][j - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]]) |
|||
lines_3d[n][j - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]]) |
|||
lines_3d[n][j - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z') |
|||
|
|||
# points.set_offsets(keypoints[i]) |
|||
|
|||
print('{}/{} '.format(i, limit), end='\r') |
|||
|
|||
fig.tight_layout() |
|||
|
|||
anim = FuncAnimation(fig, update_video, frames=np.arange(0, limit), interval=1000 / fps, repeat=False) |
|||
if output.endswith('.mp4'): |
|||
Writer = writers['ffmpeg'] |
|||
writer = Writer(fps=fps, metadata={}, bitrate=bitrate) |
|||
anim.save(output, writer=writer) |
|||
elif output.endswith('.gif'): |
|||
anim.save(output, dpi=80, writer='imagemagick') |
|||
else: |
|||
raise ValueError('Unsupported output format (only .mp4 and .gif are supported)') |
|||
plt.close() |
|||
|
|||
|
|||
def render_animation_humaneva(keypoints, keypoints_metadata, poses, skeleton, fps, bitrate, azim, output, |
|||
viewport, |
|||
limit=-1, downsample=1, size=6, input_video_path=None, input_video_skip=0, viz_action="", |
|||
viz_subject=""): |
|||
""" |
|||
TODO |
|||
Render an animation. The supported output modes are: |
|||
-- 'interactive': display an interactive figure |
|||
(also works on notebooks if associated with %matplotlib inline) |
|||
-- 'html': render the animation as HTML5 video. Can be displayed in a notebook using HTML(...). |
|||
-- 'filename.mp4': render and export the animation as an h264 video (requires ffmpeg). |
|||
-- 'filename.gif': render and export the animation as a gif file (requires imagemagick).
|||
""" |
|||
|
|||
# output = output + "_" + viz_subject + "_" + viz_action + ".mp4" |
|||
# print(output) |
|||
|
|||
plt.ioff() |
|||
fig = plt.figure(figsize=(size * (1 + len(poses)), size)) |
|||
ax_in = fig.add_subplot(1, 1 + len(poses), 1) |
|||
ax_in.get_xaxis().set_visible(False) |
|||
ax_in.get_yaxis().set_visible(False) |
|||
ax_in.set_axis_off() |
|||
ax_in.set_title('Input') |
|||
|
|||
ax_3d = [] |
|||
lines_3d = [] |
|||
lines_3d_anno = [] |
|||
trajectories = [] |
|||
radius = 1.7 |
|||
for index, (title, data) in enumerate(poses.items()): |
|||
ax = fig.add_subplot(1, 1 + len(poses), index + 2, projection='3d') |
|||
ax.view_init(elev=15., azim=azim) |
|||
ax.set_xlim3d([-radius / 2, radius / 2]) |
|||
ax.set_zlim3d([0, radius]) |
|||
ax.set_ylim3d([-radius / 2, radius / 2]) |
|||
# ax.set_aspect('equal') |
|||
ax.set_xticklabels([]) |
|||
ax.set_yticklabels([]) |
|||
ax.set_zticklabels([]) |
|||
ax.dist = 7.5 |
|||
ax.set_title(title) # , pad=35 |
|||
ax_3d.append(ax) |
|||
lines_3d.append([]) |
|||
trajectories.append(data[:, 0, [0, 1]]) |
|||
poses = list(poses.values()) |
|||
|
|||
# Decode video |
|||
if input_video_path is None: |
|||
# Black background |
|||
all_frames = np.zeros((keypoints.shape[0], viewport[1], viewport[0]), dtype='uint8') |
|||
else: |
|||
# Load video using ffmpeg |
|||
all_frames = [] |
|||
for f in read_video(input_video_path, skip=input_video_skip, limit=limit): |
|||
all_frames.append(f) |
|||
effective_length = min(keypoints.shape[0], len(all_frames)) |
|||
all_frames = all_frames[:effective_length] |
|||
|
|||
keypoints = keypoints[input_video_skip:] # todo remove |
|||
for idx in range(len(poses)): |
|||
poses[idx] = poses[idx][input_video_skip:] |
|||
|
|||
if fps is None: |
|||
fps = get_fps(input_video_path) |
|||
|
|||
if downsample > 1: |
|||
keypoints = downsample_tensor(keypoints, downsample) |
|||
all_frames = downsample_tensor(np.array(all_frames), downsample).astype('uint8') |
|||
for idx in range(len(poses)): |
|||
poses[idx] = downsample_tensor(poses[idx], downsample) |
|||
trajectories[idx] = downsample_tensor(trajectories[idx], downsample) |
|||
fps /= downsample |
|||
|
|||
initialized = False |
|||
image = None |
|||
lines = [] |
|||
points = None |
|||
|
|||
if limit < 1: |
|||
limit = len(all_frames) |
|||
else: |
|||
limit = min(limit, len(all_frames)) |
|||
|
|||
parents = skeleton.parents() |
|||
|
|||
def update_video(i): |
|||
nonlocal initialized, image, lines, points |
|||
|
|||
for n, ax in enumerate(ax_3d): |
|||
ax.set_xlim3d([-radius / 2 + trajectories[n][i, 0], radius / 2 + trajectories[n][i, 0]]) |
|||
ax.set_ylim3d([-radius / 2 + trajectories[n][i, 1], radius / 2 + trajectories[n][i, 1]]) |
|||
|
|||
# Update 2D poses |
|||
joints_right_2d = keypoints_metadata['keypoints_symmetry'][1] |
|||
joints_left_2d = keypoints_metadata['keypoints_symmetry'][0] |
|||
colors_2d = np.full(keypoints.shape[1], 'peru', dtype="object") |
|||
colors_2d[joints_right_2d] = 'darkseagreen' |
|||
colors_2d[joints_left_2d] = 'slateblue' |
|||
if not initialized: |
|||
image = ax_in.imshow(all_frames[i], aspect='equal') |
|||
|
|||
for j, j_parent in enumerate(parents): |
|||
if j_parent == -1: |
|||
continue |
|||
|
|||
# if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco': |
|||
if len(parents) == keypoints.shape[1]: |
|||
# Draw skeleton only if keypoints match (otherwise we don't have the parents definition) |
|||
lines.append(ax_in.plot([keypoints[i, j, 0], keypoints[i, j_parent, 0]], |
|||
[keypoints[i, j, 1], keypoints[i, j_parent, 1]], color=colors_2d[j])) |
|||
|
|||
col = 'red' if j in skeleton.joints_right() else 'black'  # note: currently unused; the 3D lines below use colors_2d |
|||
for n, ax in enumerate(ax_3d): |
|||
pos = poses[n][i] |
|||
lines_3d[n].append(ax.plot([pos[j, 0], pos[j_parent, 0]], |
|||
[pos[j, 1], pos[j_parent, 1]], |
|||
[pos[j, 2], pos[j_parent, 2]], zdir='z', c=colors_2d[j])) |
|||
ax.text(pos[j, 0] - 0.1, pos[j, 1] - 0.1, pos[j, 2] - 0.1, j) |
|||
|
|||
# points = ax_in.scatter(*keypoints[i].T, 0, zorder=10) |
|||
|
|||
initialized = True |
|||
else: |
|||
image.set_data(all_frames[i]) |
|||
|
|||
for j, j_parent in enumerate(parents): |
|||
if j_parent == -1: |
|||
continue |
|||
|
|||
# if len(parents) == keypoints.shape[1] and keypoints_metadata['layout_name'] != 'coco': |
|||
if len(parents) == keypoints.shape[1]: |
|||
lines[j - 1][0].set_data([keypoints[i, j, 0], keypoints[i, j_parent, 0]],  # index j-1: one 2D line per child joint was appended above |
|||
[keypoints[i, j, 1], keypoints[i, j_parent, 1]]) |
|||
|
|||
for n, ax in enumerate(ax_3d): |
|||
pos = poses[n][i] |
|||
lines_3d[n][j - 1][0].set_xdata([pos[j, 0], pos[j_parent, 0]]) |
|||
lines_3d[n][j - 1][0].set_ydata([pos[j, 1], pos[j_parent, 1]]) |
|||
lines_3d[n][j - 1][0].set_3d_properties([pos[j, 2], pos[j_parent, 2]], zdir='z') |
|||
|
|||
# points.set_offsets(keypoints[i]) |
|||
|
|||
print('{}/{} '.format(i, limit), end='\r') |
|||
|
|||
fig.tight_layout() |
|||
|
|||
anim = FuncAnimation(fig, update_video, frames=np.arange(0, limit), interval=1000 / fps, repeat=False) |
|||
if output.endswith('.mp4'): |
|||
Writer = writers['ffmpeg'] |
|||
writer = Writer(fps=fps, metadata={}, bitrate=bitrate) |
|||
anim.save(output, writer=writer) |
|||
elif output.endswith('.gif'): |
|||
anim.save(output, dpi=80, writer='imagemagick') |
|||
else: |
|||
raise ValueError('Unsupported output format (only .mp4 and .gif are supported)') |
|||
plt.close() |
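|||
# Hedged usage sketch: the mp4/gif dispatch above follows matplotlib's two |
|||
# standard export paths. The figure and update function below are placeholders |
|||
# for illustration only, not the renderer above. |
|||
if __name__ == '__main__': |
|||
    import numpy as np |
|||
    import matplotlib.pyplot as plt |
|||
    from matplotlib.animation import FuncAnimation, writers |
|||
    demo_fig, demo_ax = plt.subplots() |
|||
    demo_line, = demo_ax.plot([], []) |
|||
    demo_ax.set_xlim(0, 2 * np.pi) |
|||
    demo_ax.set_ylim(-1, 1) |
|||
    def demo_update(i): |
|||
        xs = np.linspace(0, 2 * np.pi, 100) |
|||
        demo_line.set_data(xs, np.sin(xs + 0.1 * i)) |
|||
    demo_anim = FuncAnimation(demo_fig, demo_update, frames=60, interval=1000 / 30, repeat=False) |
|||
    demo_anim.save('demo.mp4', writer=writers['ffmpeg'](fps=30, metadata={}, bitrate=3000))  # requires ffmpeg |
|||
    demo_anim.save('demo.gif', dpi=80, writer='imagemagick')  # requires imagemagick |
|||
    plt.close(demo_fig) |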
@ -0,0 +1,43 @@ |
|||
import time |
|||
|
|||
from nvitop import Device, GpuProcess, NA, colored |
|||
|
|||
print(colored(time.strftime('%a %b %d %H:%M:%S %Y'), color='red', attrs=('bold',))) |
|||
|
|||
devices = Device.cuda.all() # or `Device.all()` to use NVML ordinal instead |
|||
separator = False |
|||
for device in devices: |
|||
processes = device.processes() |
|||
|
|||
print(colored(str(device), color='green', attrs=('bold',))) |
|||
print(colored(' - Fan speed: ', color='blue', attrs=('bold',)) + f'{device.fan_speed()}%') |
|||
print(colored(' - Temperature: ', color='blue', attrs=('bold',)) + f'{device.temperature()}C') |
|||
print(colored(' - GPU utilization: ', color='blue', attrs=('bold',)) + f'{device.gpu_utilization()}%') |
|||
print(colored(' - Total memory: ', color='blue', attrs=('bold',)) + f'{device.memory_total_human()}') |
|||
print(colored(' - Used memory: ', color='blue', attrs=('bold',)) + f'{device.memory_used_human()}') |
|||
print(colored(' - Free memory: ', color='blue', attrs=('bold',)) + f'{device.memory_free_human()}') |
|||
if len(processes) > 0: |
|||
processes = GpuProcess.take_snapshots(processes.values(), failsafe=True) |
|||
processes.sort(key=lambda process: (process.username, process.pid)) |
|||
|
|||
print(colored(f' - Processes ({len(processes)}):', color='blue', attrs=('bold',))) |
|||
fmt = ' {pid:<5} {username:<8} {cpu:>5} {host_memory:>8} {time:>8} {gpu_memory:>8} {sm:>3} {command:<}'.format |
|||
print(colored(fmt(pid='PID', username='USERNAME', |
|||
cpu='CPU%', host_memory='HOST-MEM', time='TIME', |
|||
gpu_memory='GPU-MEM', sm='SM%', |
|||
command='COMMAND'), |
|||
attrs=('bold',))) |
|||
for snapshot in processes: |
|||
print(fmt(pid=snapshot.pid, |
|||
username=snapshot.username[:7] + ('+' if len(snapshot.username) > 8 else snapshot.username[7:8]), |
|||
cpu=snapshot.cpu_percent, host_memory=snapshot.host_memory_human, |
|||
time=snapshot.running_time_human, |
|||
gpu_memory=(snapshot.gpu_memory_human if snapshot.gpu_memory_human is not NA else 'WDDM:N/A'), |
|||
sm=snapshot.gpu_sm_utilization, |
|||
command=snapshot.command)) |
|||
else: |
|||
print(colored(' - No Running Processes', attrs=('bold',))) |
|||
|
|||
if separator: |
|||
print('-' * 120) |
|||
separator = True |
@ -0,0 +1,223 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
# from model.module.trans import Transformer as Transformer_s |
|||
# from model.module.trans_hypothesis import Transformer |
|||
import numpy as np |
|||
from einops import rearrange |
|||
from collections import OrderedDict |
|||
from torch.nn import functional as F |
|||
from torch.nn import init |
|||
import scipy.sparse as sp |
|||
|
|||
from timm.models.layers import DropPath |
|||
|
|||
|
|||
|
|||
class Mlp(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.1): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.drop = nn.Dropout(drop) |
|||
|
|||
def forward(self, x): |
|||
x = self.fc1(x) |
|||
x = self.act(x) |
|||
x = self.drop(x) |
|||
x = self.fc2(x) |
|||
x = self.drop(x) |
|||
return x |
|||
|
|||
|
|||
class STC_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
# print(d_time, d_joint, d_coor, head) |
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.scale = (d_coor // 2) ** -0.5 |
|||
self.proj = nn.Linear(d_coor, d_coor) |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.head = head |
|||
|
|||
# sep1 |
|||
# print(d_coor) |
|||
self.emb = nn.Embedding(5, d_coor//head//2) |
|||
self.part = torch.tensor([0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 3, 3, 3, 4, 4, 4]).long().cuda()  # note: hard-coded .cuda(); constructing the model requires a GPU |
|||
|
|||
# sep2 |
|||
self.sep2_t = nn.Conv2d(d_coor // 2, d_coor // 2, kernel_size=3, stride=1, padding=1, groups=d_coor // 2) |
|||
# self.sep2_s = nn.Conv2d(d_coor // 2, d_coor // 2, kernel_size=3, stride=1, padding=1, groups=d_coor // 2) |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
h = input |
|||
x = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(x) # b, t, s, c-> b, t, s, 3*c |
|||
qkv = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,s,c |
|||
|
|||
# space group and time group |
|||
qkv_s, qkv_t = qkv.chunk(2, 4) # [3,b,t,s,c//2], [3,b,t,s,c//2] |
|||
|
|||
q_s, k_s, v_s = qkv_s[0], qkv_s[1], qkv_s[2] # b,t,s,c//2 |
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s,c//2 |
|||
|
|||
# reshape for mat |
|||
q_s = rearrange(q_s, 'b t s (h c) -> (b h t) s c', h=self.head) # b,t,s,c//2-> b*h*t,s,c//2//h |
|||
k_s = rearrange(k_s, 'b t s (h c) -> (b h t) c s ', h=self.head) # b,t,s,c//2-> b*h*t,c//2//h,s |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,s,c//2 -> b*h*s,t,c//2//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t ', h=self.head) # b,t,s,c//2-> b*h*s,c//2//h,t |
|||
|
|||
att_s = (q_s @ k_s) * self.scale # b*h*t,s,s |
|||
att_t = (q_t @ k_t) * self.scale # b*h*s,t,t |
|||
|
|||
att_s = att_s.softmax(-1) # b*h*t,s,s |
|||
att_t = att_t.softmax(-1) # b*h*s,t,t |
|||
|
|||
v_s = rearrange(v_s, 'b t s c -> b c t s ') |
|||
v_t = rearrange(v_t, 'b t s c -> b c t s ') |
|||
|
|||
# sep2 |
|||
# sep2_s = self.sep2_s(v_s) # b,c//2,t,s |
|||
sep2_t = self.sep2_t(v_t) # b,c//2,t,s |
|||
# sep2_s = rearrange(sep2_s, 'b (h c) t s -> (b h t) s c ', h=self.head) # b*h*t,s,c//2//h |
|||
sep2_t = rearrange(sep2_t, 'b (h c) t s -> (b h s) t c ', h=self.head) # b*h*s,t,c//2//h |
|||
|
|||
# sep1 |
|||
# v_s = rearrange(v_s, 'b c t s -> (b t ) s c') |
|||
# v_t = rearrange(v_t, 'b c t s -> (b s ) t c') |
|||
# print(lep_s.shape) |
|||
# sep_s = self.emb(self.part).unsqueeze(0) # 1,s,c//2//h |
|||
sep_t = self.emb(self.part).unsqueeze(0).unsqueeze(0).unsqueeze(0) # 1,1,1,s,c//2//h |
|||
|
|||
# MSA |
|||
v_s = rearrange(v_s, 'b (h c) t s -> (b h t) s c ', h=self.head) # b*h*t,s,c//2//h |
|||
v_t = rearrange(v_t, 'b (h c) t s -> (b h s) t c ', h=self.head) # b*h*s,t,c//2//h |
|||
|
|||
# x_s = att_s @ v_s + sep2_s + 0.0001 * self.drop(sep_s) # b*h*t,s,c//2//h |
|||
x_s = att_s @ v_s |
|||
x_t = att_t @ v_t + sep2_t # b*h,t,c//h # b*h*s,t,c//2//h |
|||
|
|||
x_s = rearrange(x_s, '(b h t) s c -> b h t s c ', h=self.head, t=t) # b*h*t,s,c//h//2 -> b,h,t,s,c//h//2 |
|||
x_t = rearrange(x_t, '(b h s) t c -> b h t s c ', h=self.head, s=s) # b*h*s,t,c//h//2 -> b,h,t,s,c//h//2 |
|||
|
|||
x_t = x_t + 1e-9 * self.drop(sep_t) |
|||
|
|||
x = torch.cat((x_s, x_t), -1) # b,h,t,s,c//h |
|||
x = rearrange(x, 'b h t s c -> b t s (h c) ') # b,t,s,c |
|||
|
|||
# projection and skip-connection |
|||
x = self.proj(x) |
|||
x = x + h |
|||
return x |
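|||
# Hedged shape walkthrough of the criss-cross split above; the sizes are |
|||
# illustrative assumptions matching the defaults used in __main__ below. |
|||
def _stc_split_shape_demo(): |
|||
    import torch |
|||
    from einops import rearrange |
|||
    b, t, s, c, head = 2, 27, 17, 256, 8 |
|||
    x = torch.randn(b, t, s, c) |
|||
    x_s, x_t = x.chunk(2, dim=-1)  # spatial half / temporal half: b,t,s,c//2 each |
|||
    q_s = rearrange(x_s, 'b t s (h c) -> (b h t) s c', h=head)  # per-frame sequences over the s joints |
|||
    q_t = rearrange(x_t, 'b t s (h c) -> (b h s) t c', h=head)  # per-joint sequences over the t frames |
|||
    print(q_s.shape)  # torch.Size([432, 17, 16]) = (2*8*27, 17, 128//8) |
|||
    print(q_t.shape)  # torch.Size([272, 27, 16]) = (2*8*17, 27, 128//8) |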
|||
|
|||
|
|||
class STC_BLOCK(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor): |
|||
super().__init__() |
|||
|
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.mlp = Mlp(d_coor, d_coor * 4, d_coor) |
|||
|
|||
self.stc_att = STC_ATTENTION(d_time, d_joint, d_coor) |
|||
self.drop = DropPath(0.0) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
x = self.stc_att(input) |
|||
x = x + self.drop(self.mlp(self.layer_norm(x))) |
|||
|
|||
return x |
|||
|
|||
|
|||
class STCFormer(nn.Module): |
|||
def __init__(self, num_block, d_time, d_joint, d_coor): |
|||
super(STCFormer, self).__init__() |
|||
|
|||
self.num_block = num_block |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
|
|||
self.stc_block = [] |
|||
for l in range(self.num_block): |
|||
self.stc_block.append(STC_BLOCK(self.d_time, self.d_joint, self.d_coor)) |
|||
self.stc_block = nn.ModuleList(self.stc_block) |
|||
|
|||
def forward(self, input): |
|||
# blocks layers |
|||
for i in range(self.num_block): |
|||
input = self.stc_block[i](input) |
|||
# exit() |
|||
return input |
|||
|
|||
|
|||
class Model(nn.Module): |
|||
def __init__(self, args): |
|||
super().__init__() |
|||
|
|||
layers, d_hid, frames = args.layers, args.d_hid, args.frames |
|||
num_joints_in, num_joints_out = args.n_joints, args.out_joints |
|||
|
|||
# layers, length, d_hid = layers, frames, d_hid |
|||
# num_joints_in, num_joints_out = 17,17 |
|||
|
|||
self.pose_emb = nn.Linear(2, d_hid, bias=False) |
|||
self.gelu = nn.GELU() |
|||
self.stcformer = STCFormer(layers, frames, num_joints_in, d_hid) |
|||
self.regress_head = nn.Linear(d_hid, 3, bias=False) |
|||
|
|||
def forward(self, x): |
|||
# b, t, s, c = x.shape #batch,frame,joint,coordinate |
|||
# dimension transfer |
|||
x = self.pose_emb(x) |
|||
x = self.gelu(x) |
|||
# spatio-temporal correlation |
|||
x = self.stcformer(x) |
|||
# regression head |
|||
x = self.regress_head(x) |
|||
|
|||
return x |
|||
|
|||
class Args: |
|||
def __init__(self, layers, d_hid, frames, n_joints, out_joints): |
|||
self.layers = layers |
|||
self.d_hid = d_hid |
|||
self.frames = frames |
|||
self.n_joints = n_joints |
|||
self.out_joints = out_joints |
|||
|
|||
if __name__ == "__main__": |
|||
# inputs = torch.rand(64, 351, 34) # [btz, channel, T, H, W] |
|||
# inputs = torch.rand(1, 64, 4, 112, 112) #[btz, channel, T, H, W] |
|||
args = Args(layers=6, d_hid=256, frames=27, n_joints=17, out_joints=17) |
|||
net = Model(args) |
|||
inputs = torch.rand([1, 27, 17, 2]) |
|||
if torch.cuda.is_available(): |
|||
net = net.cuda() |
|||
inputs = inputs.cuda() |
|||
output = net(inputs) |
|||
print(output.size()) |
|||
|
|||
from thop import profile |
|||
# flops = 2*macs; measure the model's compute cost and parameter count |
|||
macs, params = profile(net, inputs=(inputs,)) |
|||
print(2*macs) |
|||
print(params) |
@ -0,0 +1,36 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
from torch.autograd import Variable |
|||
|
|||
fc_out = 256 |
|||
fc_unit = 1024 |
|||
|
|||
class refine(nn.Module): |
|||
def __init__(self, opt): |
|||
super().__init__() |
|||
|
|||
out_seqlen = 1 |
|||
fc_in = opt.out_channels*2*out_seqlen*opt.n_joints |
|||
fc_out = opt.in_channels * opt.n_joints |
|||
|
|||
self.post_refine = nn.Sequential( |
|||
nn.Linear(fc_in, fc_unit), |
|||
nn.ReLU(), |
|||
nn.Dropout(0.5,inplace=True), |
|||
nn.Linear(fc_unit, fc_out), |
|||
nn.Sigmoid() |
|||
) |
|||
|
|||
def forward(self, x, x_1): |
|||
N, T, V,_ = x.size() |
|||
x_in = torch.cat((x, x_1), -1) |
|||
x_in = x_in.view(N, -1) |
|||
|
|||
score = self.post_refine(x_in).view(N,T,V,2) |
|||
score_cm = Variable(torch.ones(score.size()), requires_grad=False).cuda() - score |
|||
x_out = x.clone() |
|||
x_out[:, :, :, :2] = score * x[:, :, :, :2] + score_cm * x_1[:, :, :, :2] |
|||
|
|||
return x_out |
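|||
# Hedged usage sketch; `Opt` is a hypothetical stand-in for the real options |
|||
# object, and the module hard-codes .cuda(), so a GPU is required. |
|||
if __name__ == '__main__': |
|||
    class Opt: |
|||
        out_channels = 3  # channels of the 3D prediction |
|||
        in_channels = 2   # channels of the 2D input |
|||
        n_joints = 17 |
|||
    if torch.cuda.is_available(): |
|||
        post = refine(Opt()).cuda() |
|||
        x = torch.rand(4, 1, 17, 3).cuda()    # poses from one source (N, T, V, C) |
|||
        x_1 = torch.rand(4, 1, 17, 3).cuda()  # poses from a second source |
|||
        out = post(x, x_1)                    # x/y blended with learned per-joint weights |
|||
        print(out.shape)                      # torch.Size([4, 1, 17, 3]) |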
|||
|
|||
|
@ -0,0 +1,171 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
import torch.nn.functional as F |
|||
from torch.autograd import Variable |
|||
import numpy as np |
|||
import math |
|||
import os |
|||
import copy |
|||
|
|||
def clones(module, N): |
|||
return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) |
|||
|
|||
class Encoder(nn.Module): |
|||
def __init__(self, layer, N, length, d_model): |
|||
super(Encoder, self).__init__() |
|||
self.layers = layer |
|||
self.norm = LayerNorm(d_model) |
|||
|
|||
self.pos_embedding_1 = nn.Parameter(torch.randn(1, length, d_model)) |
|||
self.pos_embedding_2 = nn.Parameter(torch.randn(1, length, d_model)) |
|||
self.pos_embedding_3 = nn.Parameter(torch.randn(1, length, d_model)) |
|||
|
|||
def forward(self, x, mask): |
|||
for i, layer in enumerate(self.layers): |
|||
if i == 0: |
|||
x += self.pos_embedding_1[:, :x.shape[1]] |
|||
elif i == 1: |
|||
x += self.pos_embedding_2[:, :x.shape[1]] |
|||
elif i == 2: |
|||
x += self.pos_embedding_3[:, :x.shape[1]] |
|||
|
|||
x = layer(x, mask, i) |
|||
|
|||
return x |
|||
|
|||
class LayerNorm(nn.Module): |
|||
def __init__(self, features, eps=1e-6): |
|||
super(LayerNorm, self).__init__() |
|||
self.a_2 = nn.Parameter(torch.ones(features)) |
|||
self.b_2 = nn.Parameter(torch.zeros(features)) |
|||
self.eps = eps |
|||
|
|||
def forward(self, x): |
|||
mean = x.mean(-1, keepdim=True) |
|||
std = x.std(-1, keepdim=True) |
|||
return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 |
|||
|
|||
def attention(query, key, value, mask=None, dropout=None): |
|||
d_k = query.size(-1) |
|||
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) |
|||
if mask is not None: |
|||
scores = scores.masked_fill(mask == 0, -1e9) |
|||
p_attn = F.softmax(scores, dim=-1) |
|||
|
|||
if dropout is not None: |
|||
p_attn = dropout(p_attn) |
|||
return torch.matmul(p_attn, value), p_attn |
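|||
# For reference, the helper above computes standard scaled dot-product attention: |
|||
#     Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V |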
|||
|
|||
|
|||
class SublayerConnection(nn.Module): |
|||
def __init__(self, size, dropout, stride_num, i): |
|||
super(SublayerConnection, self).__init__() |
|||
self.norm = LayerNorm(size) |
|||
self.dropout = nn.Dropout(dropout) |
|||
self.pooling = nn.MaxPool1d(1, stride_num[i]) |
|||
|
|||
def forward(self, x, sublayer, i=-1, stride_num=-1): |
|||
if i != -1: |
|||
if stride_num[i] != 1: |
|||
res = self.pooling(x.permute(0, 2, 1)) |
|||
res = res.permute(0, 2, 1) |
|||
|
|||
return res + self.dropout(sublayer(self.norm(x))) |
|||
else: |
|||
return x + self.dropout(sublayer(self.norm(x))) |
|||
else: |
|||
return x + self.dropout(sublayer(self.norm(x))) |
|||
|
|||
|
|||
class EncoderLayer(nn.Module): |
|||
def __init__(self, size, self_attn, feed_forward, dropout, stride_num, i): |
|||
super(EncoderLayer, self).__init__() |
|||
self.self_attn = self_attn |
|||
self.feed_forward = feed_forward |
|||
self.stride_num = stride_num |
|||
self.sublayer = clones(SublayerConnection(size, dropout, stride_num, i), 2) |
|||
self.size = size |
|||
|
|||
def forward(self, x, mask, i): |
|||
x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) |
|||
x = self.sublayer[1](x, self.feed_forward, i, self.stride_num) |
|||
return x |
|||
|
|||
|
|||
class MultiHeadedAttention(nn.Module): |
|||
def __init__(self, h, d_model, dropout=0.1): |
|||
super(MultiHeadedAttention, self).__init__() |
|||
assert d_model % h == 0 |
|||
self.d_k = d_model // h |
|||
self.h = h |
|||
self.linears = clones(nn.Linear(d_model, d_model), 4) |
|||
self.attn = None |
|||
self.dropout = nn.Dropout(p=dropout) |
|||
|
|||
def forward(self, query, key, value, mask=None): |
|||
if mask is not None: |
|||
mask = mask.unsqueeze(1) |
|||
nbatches = query.size(0) |
|||
|
|||
query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) |
|||
for l, x in zip(self.linears, (query, key, value))] |
|||
|
|||
x, self.attn = attention(query, key, value, mask=mask, |
|||
dropout=self.dropout) |
|||
|
|||
x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) |
|||
return self.linears[-1](x) |
|||
|
|||
|
|||
class PositionwiseFeedForward(nn.Module): |
|||
def __init__(self, d_model, d_ff, dropout=0.1, number = -1, stride_num=-1): |
|||
super(PositionwiseFeedForward, self).__init__() |
|||
self.w_1 = nn.Conv1d(d_model, d_ff, kernel_size=1, stride=1) |
|||
self.w_2 = nn.Conv1d(d_ff, d_model, kernel_size=3, stride=stride_num[number], padding = 1) |
|||
|
|||
self.gelu = nn.ReLU()  # note: named "gelu" but actually ReLU |
|||
|
|||
self.dropout = nn.Dropout(dropout) |
|||
|
|||
def forward(self, x): |
|||
x = x.permute(0, 2, 1) |
|||
x = self.w_2(self.dropout(self.gelu(self.w_1(x)))) |
|||
x = x.permute(0, 2, 1) |
|||
|
|||
return x |
|||
|
|||
class Transformer(nn.Module): |
|||
def __init__(self, n_layers=3, d_model=256, d_ff=512, h=8, length=27, stride_num=None, dropout=0.1): |
|||
super(Transformer, self).__init__() |
|||
|
|||
self.length = length |
|||
|
|||
self.stride_num = stride_num |
|||
self.model = self.make_model(N=n_layers, d_model=d_model, d_ff=d_ff, h=h, dropout=dropout, length = self.length) |
|||
|
|||
def forward(self, x, mask=None): |
|||
x = self.model(x, mask) |
|||
|
|||
return x |
|||
|
|||
def make_model(self, N=3, d_model=256, d_ff=512, h=8, dropout=0.1, length=27): |
|||
c = copy.deepcopy |
|||
attn = MultiHeadedAttention(h, d_model) |
|||
|
|||
model_EncoderLayer = [] |
|||
for i in range(N): |
|||
ff = PositionwiseFeedForward(d_model, d_ff, dropout, i, self.stride_num) |
|||
model_EncoderLayer.append(EncoderLayer(d_model, c(attn), c(ff), dropout, self.stride_num, i)) |
|||
|
|||
model_EncoderLayer = nn.ModuleList(model_EncoderLayer) |
|||
|
|||
model = Encoder(model_EncoderLayer, N, length, d_model) |
|||
|
|||
return model |
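|||
# Hedged usage sketch for the strided encoder above; the stride schedule |
|||
# [3, 3, 3] is an assumption (any per-layer strides that progressively divide |
|||
# `length` behave the same way). |
|||
if __name__ == '__main__': |
|||
    model = Transformer(n_layers=3, d_model=256, d_ff=512, h=8, length=27, stride_num=[3, 3, 3]) |
|||
    x = torch.randn(2, 27, 256)  # (batch, frames, channels) |
|||
    y = model(x)                 # temporal length shrinks 27 -> 9 -> 3 -> 1 |
|||
    print(y.shape)               # torch.Size([2, 1, 256]) |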
|||
|
|||
|
|||
|
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,133 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
import torch.nn.functional as F |
|||
from torch.autograd import Variable |
|||
import numpy as np |
|||
import math |
|||
import os |
|||
import copy |
|||
|
|||
def clones(module, N): |
|||
return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) |
|||
|
|||
class Encoder(nn.Module): |
|||
def __init__(self, layer, N): |
|||
super(Encoder, self).__init__() |
|||
self.layers = clones(layer, N) |
|||
self.norm = LayerNorm(layer.size) |
|||
|
|||
def forward(self, x, mask): |
|||
for layer in self.layers: |
|||
x = layer(x, mask) |
|||
return x |
|||
|
|||
class LayerNorm(nn.Module): |
|||
def __init__(self, features, eps=1e-6): |
|||
super(LayerNorm, self).__init__() |
|||
self.a_2 = nn.Parameter(torch.ones(features)) |
|||
self.b_2 = nn.Parameter(torch.zeros(features)) |
|||
self.eps = eps |
|||
|
|||
def forward(self, x): |
|||
mean = x.mean(-1, keepdim=True) |
|||
std = x.std(-1, keepdim=True) |
|||
return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 |
|||
|
|||
def attention(query, key, value, mask=None, dropout=None): |
|||
d_k = query.size(-1) |
|||
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) |
|||
|
|||
if mask is not None: |
|||
scores = scores.masked_fill(mask == 0, -1e9) |
|||
p_attn = F.softmax(scores, dim=-1) |
|||
|
|||
if dropout is not None: |
|||
p_attn = dropout(p_attn) |
|||
return torch.matmul(p_attn, value), p_attn |
|||
|
|||
|
|||
class SublayerConnection(nn.Module): |
|||
def __init__(self, size, dropout): |
|||
super(SublayerConnection, self).__init__() |
|||
self.norm = LayerNorm(size) |
|||
self.dropout = nn.Dropout(dropout) |
|||
|
|||
def forward(self, x, sublayer): |
|||
return x + self.dropout(sublayer(self.norm(x))) |
|||
|
|||
|
|||
class EncoderLayer(nn.Module): |
|||
def __init__(self, size, self_attn, feed_forward, dropout): |
|||
super(EncoderLayer, self).__init__() |
|||
self.self_attn = self_attn |
|||
self.feed_forward = feed_forward |
|||
self.sublayer = clones(SublayerConnection(size, dropout), 2) |
|||
self.size = size |
|||
|
|||
def forward(self, x, mask): |
|||
x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) |
|||
return self.sublayer[1](x, self.feed_forward) |
|||
|
|||
|
|||
class MultiHeadedAttention(nn.Module): |
|||
def __init__(self, h, d_model, dropout=0.1): |
|||
super(MultiHeadedAttention, self).__init__() |
|||
assert d_model % h == 0 |
|||
self.d_k = d_model // h |
|||
self.h = h |
|||
self.linears = clones(nn.Linear(d_model, d_model), 4) |
|||
self.attn = None |
|||
self.dropout = nn.Dropout(p=dropout) |
|||
|
|||
def forward(self, query, key, value, mask=None): |
|||
if mask is not None: |
|||
mask = mask.unsqueeze(1) |
|||
nbatches = query.size(0) |
|||
|
|||
query, key, value = \ |
|||
[l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) |
|||
for l, x in zip(self.linears, (query, key, value))] |
|||
|
|||
x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout) |
|||
|
|||
x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) |
|||
return self.linears[-1](x) |
|||
|
|||
|
|||
class PositionwiseFeedForward(nn.Module): |
|||
def __init__(self, d_model, d_ff, dropout=0.1): |
|||
super(PositionwiseFeedForward, self).__init__() |
|||
self.w_1 = nn.Linear(d_model, d_ff) |
|||
self.w_2 = nn.Linear(d_ff, d_model) |
|||
self.gelu = nn.ReLU()  # note: named "gelu" but actually ReLU |
|||
self.dropout = nn.Dropout(dropout) |
|||
|
|||
def forward(self, x): |
|||
return self.w_2(self.dropout(self.gelu(self.w_1(x)))) |
|||
|
|||
class Transformer(nn.Module): |
|||
def __init__(self, n_layers=3, d_model=256, d_ff=512, h=8, dropout=0.1, length=27): |
|||
super(Transformer, self).__init__() |
|||
|
|||
self.pos_embedding = nn.Parameter(torch.randn(1, length, d_model)) |
|||
self.model = self.make_model(N=n_layers, d_model=d_model, d_ff=d_ff, h=h, dropout=dropout) |
|||
|
|||
def forward(self, x, mask=None): |
|||
|
|||
x += self.pos_embedding |
|||
|
|||
x = self.model(x, mask) |
|||
|
|||
return x |
|||
|
|||
def make_model(self, N=3, d_model=256, d_ff=512, h=8, dropout=0.1): |
|||
c = copy.deepcopy |
|||
attn = MultiHeadedAttention(h, d_model) |
|||
ff = PositionwiseFeedForward(d_model, d_ff, dropout) |
|||
model = Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N) |
|||
return model |
|||
|
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,158 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
import torch.nn.functional as F |
|||
from torch.autograd import Variable |
|||
import numpy as np |
|||
import math |
|||
import os |
|||
import copy |
|||
|
|||
def clones(module, N): |
|||
return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) |
|||
|
|||
class Encoder(nn.Module): |
|||
def __init__(self, layer, N): |
|||
super(Encoder, self).__init__() |
|||
self.layers = clones(layer, N) |
|||
self.norm = LayerNorm(layer.size) |
|||
|
|||
def forward(self, x, mask): |
|||
for layer in self.layers: |
|||
x = layer(x, mask) |
|||
return x |
|||
|
|||
class LayerNorm(nn.Module): |
|||
def __init__(self, features, eps=1e-6): |
|||
super(LayerNorm, self).__init__() |
|||
self.a_2 = nn.Parameter(torch.ones(features)) |
|||
self.b_2 = nn.Parameter(torch.zeros(features)) |
|||
self.eps = eps |
|||
|
|||
def forward(self, x): |
|||
mean = x.mean(-1, keepdim=True) |
|||
std = x.std(-1, keepdim=True) |
|||
return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 |
|||
|
|||
def attention(query, key, value, mask=None, dropout=None): |
|||
d_k = query.size(-1) |
|||
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) |
|||
|
|||
if mask is not None: |
|||
scores = scores.masked_fill(mask == 0, -1e9) |
|||
p_attn = F.softmax(scores, dim=-1) |
|||
|
|||
if dropout is not None: |
|||
p_attn = dropout(p_attn) |
|||
return torch.matmul(p_attn, value), p_attn |
|||
|
|||
|
|||
class SublayerConnection(nn.Module): |
|||
def __init__(self, size, dropout): |
|||
super(SublayerConnection, self).__init__() |
|||
self.norm = LayerNorm(size) |
|||
self.dropout = nn.Dropout(dropout) |
|||
|
|||
def forward(self, x, sublayer): |
|||
return x + self.dropout(sublayer(self.norm(x))) |
|||
|
|||
|
|||
class EncoderLayer(nn.Module): |
|||
def __init__(self, size, self_attn, feed_forward, dropout): |
|||
super(EncoderLayer, self).__init__() |
|||
self.self_attn = self_attn |
|||
self.feed_forward = feed_forward |
|||
self.sublayer = clones(SublayerConnection(size, dropout), 2) |
|||
self.size = size |
|||
|
|||
def forward(self, x, mask): |
|||
x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) |
|||
return self.sublayer[1](x, self.feed_forward) |
|||
|
|||
|
|||
class MultiHeadedAttention(nn.Module): |
|||
def __init__(self, h, d_model, dropout=0.1): |
|||
super(MultiHeadedAttention, self).__init__() |
|||
assert d_model % h == 0 |
|||
self.d_k = d_model // h |
|||
self.h = h |
|||
self.linears = clones(nn.Linear(d_model, d_model), 4) |
|||
self.attn = None |
|||
self.dropout = nn.Dropout(p=dropout) |
|||
|
|||
def forward(self, query, key, value, mask=None): |
|||
if mask is not None: |
|||
mask = mask.unsqueeze(1) |
|||
nbatches = query.size(0) |
|||
|
|||
query, key, value = \ |
|||
[l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) |
|||
for l, x in zip(self.linears, (query, key, value))] |
|||
|
|||
x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout) |
|||
|
|||
x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) |
|||
return self.linears[-1](x) |
|||
|
|||
|
|||
class PositionwiseFeedForward(nn.Module): |
|||
def __init__(self, d_model, d_ff, dropout=0.1): |
|||
super(PositionwiseFeedForward, self).__init__() |
|||
self.w_1 = nn.Linear(d_model, d_ff) |
|||
self.w_2 = nn.Linear(d_ff, d_model) |
|||
self.gelu = nn.ReLU()  # note: named "gelu" but actually ReLU |
|||
self.dropout = nn.Dropout(dropout) |
|||
|
|||
def forward(self, x): |
|||
return self.w_2(self.dropout(self.gelu(self.w_1(x)))) |
|||
|
|||
class Transformer(nn.Module): |
|||
def __init__(self, n_layers=3, d_model=256, d_ff=512, h=8, dropout=0.1, length=27): |
|||
super(Transformer, self).__init__() |
|||
|
|||
self.pos_embedding = nn.Parameter(torch.randn(1, length, d_model)) |
|||
self.model = self.make_model(N=n_layers, d_model=d_model, d_ff=d_ff, h=h, dropout=dropout) |
|||
|
|||
def forward(self, x, mask_MAE=None, mask=None): |
|||
x += self.pos_embedding |
|||
#print(str(mask_MAE)) |
|||
if mask_MAE is not None: |
|||
B, _, C = x.shape |
|||
x_vis = x[:,~mask_MAE].reshape(B, -1, C) # ~mask means visible |
|||
|
|||
x = self.model(x_vis, mask) |
|||
else: |
|||
x = self.model(x, mask) |
|||
|
|||
return x |
|||
|
|||
def make_model(self, N=3, d_model=256, d_ff=512, h=8, dropout=0.1): |
|||
c = copy.deepcopy |
|||
attn = MultiHeadedAttention(h, d_model) |
|||
ff = PositionwiseFeedForward(d_model, d_ff, dropout) |
|||
model = Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N) |
|||
return model |
|||
|
|||
|
|||
class Transformer_dec(nn.Module): |
|||
def __init__(self, n_layers=3, d_model=256, d_ff=512, h=8, dropout=0.1, length=27): |
|||
super(Transformer_dec, self).__init__() |
|||
|
|||
self.model = self.make_model(N=n_layers, d_model=d_model, d_ff=d_ff, h=h, dropout=dropout) |
|||
|
|||
|
|||
def forward(self, x, return_token_num, mask=None):  # return_token_num is accepted but unused |
|||
|
|||
x = self.model(x, mask) |
|||
|
|||
return x |
|||
|
|||
def make_model(self, N=3, d_model=256, d_ff=512, h=8, dropout=0.1): |
|||
c = copy.deepcopy |
|||
attn = MultiHeadedAttention(h, d_model) |
|||
ff = PositionwiseFeedForward(d_model, d_ff, dropout) |
|||
model = Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N) |
|||
return model |
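|||
# Hedged sketch of the MAE-style masking path in Transformer.forward above; |
|||
# masking every third frame is an arbitrary illustrative choice. |
|||
if __name__ == '__main__': |
|||
    enc = Transformer(n_layers=3, d_model=256, d_ff=512, h=8, length=27) |
|||
    x = torch.randn(2, 27, 256) |
|||
    mask_MAE = torch.zeros(27, dtype=torch.bool) |
|||
    mask_MAE[::3] = True                 # True = masked out; ~mask_MAE = visible |
|||
    out = enc(x, mask_MAE=mask_MAE)      # only the 18 visible tokens are encoded |
|||
    print(out.shape)                     # torch.Size([2, 18, 256]) |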
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,369 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
# from model.module.trans import Transformer as Transformer_s |
|||
# from model.module.trans_hypothesis import Transformer |
|||
import numpy as np |
|||
from einops import rearrange |
|||
from collections import OrderedDict |
|||
from torch.nn import functional as F |
|||
from torch.nn import init |
|||
import scipy.sparse as sp |
|||
|
|||
from timm.models.layers import DropPath |
|||
|
|||
""" |
|||
Network structure: |
|||
128 joint -> part |
|||
+ 256 -> mlp |
|||
128 pose |
|||
""" |
|||
|
|||
|
|||
class Mlp(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.1): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.drop = nn.Dropout(drop) |
|||
|
|||
def forward(self, x): |
|||
x = self.fc1(x) |
|||
x = self.act(x) |
|||
x = self.drop(x) |
|||
x = self.fc2(x) |
|||
x = self.drop(x) |
|||
return x |
|||
|
|||
|
|||
class Joint_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.scale = (d_coor) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, s, c-> b, t, s, 3*c |
|||
qkv_t = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,s,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,s,c -> b*h*s,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t ', h=self.head) # b,t,s,c-> b*h*s,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*s,t,t |
|||
att_t = att_t.softmax(-1) # b*h*s,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t s c -> b c t s ') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t s -> (b h s) t c', h = self.head) # b*h*s,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*s,t,c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h s) t c -> b t s (h c)', s=s, h=self.head) # b,t,s,c |
|||
|
|||
return x_t |
|||
|
|||
|
|||
class Part_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list, head=8): |
|||
super().__init__() |
|||
|
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
|
|||
self.head = head |
|||
|
|||
self.num_of_part = len(part_list) |
|||
self.num_joint_of_part = len(part_list[0]) |
|||
|
|||
self.scale = (d_coor * self.num_joint_of_part) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.layer_norm = nn.LayerNorm(d_coor * self.num_joint_of_part) |
|||
|
|||
self.pos_embed = nn.Embedding(d_time, d_coor * self.num_joint_of_part) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * self.num_joint_of_part, d_coor * self.num_joint_of_part * 3) |
|||
self.drop = DropPath(0.5) |
|||
# check part_list |
|||
for part in part_list: |
|||
assert len(part) == 3 # each part should have 3 joints |
|||
for idx in part: |
|||
assert 0 <= idx < d_joint # joint index should be less than d_joint |
|||
|
|||
self.idx_joint2part = torch.tensor([idx for part in part_list for idx in part], dtype=torch.long) |
|||
self.idx_joint2part = self.idx_joint2part.flatten().cuda() |
|||
idx_part2joint = list(range(d_joint)) |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
idx_part2joint[idx] = i |
|||
self.idx_part2joint = torch.tensor(idx_part2joint, dtype=torch.long).cuda() |
|||
|
|||
self.overlap = self.get_overlap() |
|||
|
|||
# find joints that appear in more than one part |
|||
def get_overlap(self): |
|||
overlap_list = [-1] * self.d_joint |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
if overlap_list[idx] == -1: |
|||
overlap_list[idx] = i |
|||
else: |
|||
if not isinstance(overlap_list[idx], list): |
|||
overlap_i = overlap_list[idx] |
|||
overlap_list[idx] = list() |
|||
overlap_list[idx].append(overlap_i) |
|||
overlap_list[idx].append(i) |
|||
|
|||
overlap = [] |
|||
for i in overlap_list: |
|||
if isinstance(i, list): |
|||
overlap.append(i) |
|||
|
|||
if len(overlap) == 0: |
|||
return None |
|||
else: |
|||
return overlap |
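|||
# Worked example: with the part_list used in __main__ below, joint 8 appears in |
|||
# both the head part [8, 9, 10] and the torso part [0, 7, 8]. Flattened, |
|||
# idx_joint2part starts [8, 9, 10, 0, 7, 8, ...], so joint 8 sits at positions |
|||
# 0 and 5 and get_overlap() returns [[0, 5]]; forward() then averages those two |
|||
# copies before mapping back to the original joint ordering. |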
|||
|
|||
def forward(self, input): |
|||
input = torch.index_select(input, 2, self.idx_joint2part) |
|||
input = rearrange(input, 'b t (p j) c -> b t p (j c)', j=self.num_joint_of_part) |
|||
|
|||
b, t, p, c = input.shape |
|||
|
|||
emb = self.pos_embed(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, p, c-> b, t, p, 3*c |
|||
qkv_t = qkv.reshape(b, t, p, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,p,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,p,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,p,c -> b*h*p,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t', h=self.head) # b,t,p,c-> b*h*p,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*p,t,t |
|||
att_t = att_t.softmax(-1) # b*h*p,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t p c -> b c t p') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t p -> (b h p) t c ', h=self.head) # b*h*p,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*p,t,c//h |
|||
|
|||
x = rearrange(x_t, '(b h p) t c -> b h t p c', h=self.head, p=p) # b*h*p,t,c//h -> b,h,t,p,c//h |
|||
|
|||
# restore the original joint ordering |
|||
x = rearrange(x, 'b h t p c -> b t p (h c)') |
|||
x = rearrange(x, 'b t p (j c) -> b t (p j) c', j=self.num_joint_of_part) |
|||
|
|||
# average the duplicated copies of overlapping joints |
|||
if self.overlap: |
|||
for overlap in self.overlap: |
|||
idx = overlap[-1] |
|||
for i in overlap[:-1]: |
|||
x[:, :, idx, :] += x[:, :, i, :] |
|||
x[:, :, idx, :] /= len(overlap) |
|||
|
|||
x = torch.index_select(x, 2, self.idx_part2joint) |
|||
return x |
|||
|
|||
|
|||
class Pose_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
self.head = head |
|||
|
|||
self.scale = (d_coor * d_joint) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.layer_norm = nn.LayerNorm(d_coor * self.d_joint) |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor * d_joint) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * d_joint, d_coor * d_joint * 3) |
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
input = rearrange(input, 'b t s c -> b t (s c)') |
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, s*c -> b, t, 3*s*c |
|||
qkv_t = qkv.reshape(b, t, s*c, 3).permute(3, 0, 1, 2) # 3,b,t,s*c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s*c |
|||
|
|||
# reshape for mat |
|||
q_t = rearrange(q_t, 'b t (h c) -> (b h) t c', h=self.head) # b,t,s*c -> b*h,t,s*c//h |
|||
k_t = rearrange(k_t, 'b t (h c) -> (b h) c t ', h=self.head) # b,t,s*c-> b*h,s*c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h,t,t |
|||
att_t = att_t.softmax(-1) # b*h,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t (h c) -> (b h) t c', h=self.head) # b*h,t,s*c//h |
|||
|
|||
x_t = att_t @ v_t # b*h,t,s*c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h) t (s c) -> b t s (h c) ', h=self.head, s=s) # b*h,t,s*c//h -> b,t,s,c |
|||
|
|||
return x_t |
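|||
# The three attention modules above differ only in token granularity along the |
|||
# temporal axis: Joint_ATTENTION attends over frames separately per joint, |
|||
# Part_ATTENTION per 3-joint part, and Pose_ATTENTION per whole pose (all |
|||
# joints flattened into a single token per frame). |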
|||
|
|||
|
|||
class HP_BLOCK(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list): |
|||
super().__init__() |
|||
|
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.mlp = Mlp(d_coor, d_coor*4, d_coor) |
|||
|
|||
self.joint_att = Joint_ATTENTION(d_time, d_joint, d_coor//2) |
|||
self.part_att = Part_ATTENTION(d_time, d_joint, d_coor//2, part_list) |
|||
self.pose_att = Pose_ATTENTION(d_time, d_joint, d_coor//2) |
|||
|
|||
self.drop = DropPath(0.0) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
h = input |
|||
# x = self.layer_norm(input) |
|||
|
|||
x_joint, x_pose = input.chunk(2, 3) |
|||
|
|||
x = torch.cat(( |
|||
self.part_att(self.joint_att(x_joint)), |
|||
self.pose_att(x_pose) |
|||
), -1) |
|||
|
|||
x = x + h |
|||
x = x + self.drop(self.mlp(self.layer_norm(x))) |
|||
|
|||
return x |
|||
|
|||
|
|||
class HPFormer(nn.Module): |
|||
def __init__(self, num_block, d_time, d_joint, d_coor, part_list): |
|||
super(HPFormer, self).__init__() |
|||
|
|||
self.num_block = num_block |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
self.part_list = part_list |
|||
|
|||
self.hp_block = [] |
|||
for l in range(self.num_block): |
|||
self.hp_block.append(HP_BLOCK(self.d_time, self.d_joint, self.d_coor, self.part_list)) |
|||
self.hp_block = nn.ModuleList(self.hp_block) |
|||
|
|||
def forward(self, input): |
|||
for i in range(self.num_block): |
|||
input = self.hp_block[i](input) |
|||
|
|||
return input |
|||
|
|||
|
|||
class Model(nn.Module): |
|||
def __init__(self, args): |
|||
super().__init__() |
|||
|
|||
layers, d_hid, frames = args.layers, args.d_hid, args.frames |
|||
num_joints_in, num_joints_out = args.n_joints, args.out_joints |
|||
part_list = args.part_list |
|||
|
|||
# layers, length, d_hid = layers, frames, d_hid |
|||
# num_joints_in, num_joints_out = 17,17 |
|||
|
|||
self.pose_emb = nn.Linear(2, d_hid, bias=False) |
|||
self.gelu = nn.GELU() |
|||
self.hpformer = HPFormer(layers, frames, num_joints_in, d_hid, part_list) |
|||
self.regress_head = nn.Linear(d_hid, 3, bias=False) |
|||
|
|||
def forward(self, x): |
|||
# b, t, s, c = x.shape #batch,frame,joint,coordinate |
|||
# dimension transfer |
|||
x = self.pose_emb(x) |
|||
x = self.gelu(x) |
|||
# spatio-temporal correlation |
|||
x = self.hpformer(x) |
|||
# regression head |
|||
x = self.regress_head(x) |
|||
|
|||
return x |
|||
|
|||
class Args: |
|||
def __init__(self, layers, d_hid, frames, n_joints, out_joints): |
|||
self.layers = layers |
|||
self.d_hid = d_hid |
|||
self.frames = frames |
|||
self.n_joints = n_joints |
|||
self.out_joints = out_joints |
|||
|
|||
if __name__ == "__main__": |
|||
# inputs = torch.rand(64, 351, 34) # [btz, channel, T, H, W] |
|||
# inputs = torch.rand(1, 64, 4, 112, 112) #[btz, channel, T, H, W] |
|||
args = Args(layers=6, d_hid=192, frames=27, n_joints=17, out_joints=17) |
|||
args.part_list = [ |
|||
[8, 9, 10], # head |
|||
[0, 7, 8], # torso |
|||
[11, 12, 13], # left arm |
|||
[14, 15, 16], # right arm |
|||
[4, 5, 6], # left leg |
|||
[1, 2, 3] # right leg |
|||
] |
|||
net = Model(args) |
|||
inputs = torch.rand([1, 27, 17, 2]) |
|||
if torch.cuda.is_available(): |
|||
net = net.cuda() |
|||
inputs = inputs.cuda() |
|||
output = net(inputs) |
|||
print(output.size()) |
|||
|
|||
from thop import profile |
|||
# flops = 2*macs; measure the model's compute cost and parameter count |
|||
macs, params = profile(net, inputs=(inputs,)) |
|||
print(2*macs) |
|||
print(params) |
@ -0,0 +1,384 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
# from model.module.trans import Transformer as Transformer_s |
|||
# from model.module.trans_hypothesis import Transformer |
|||
import numpy as np |
|||
from einops import rearrange |
|||
from collections import OrderedDict |
|||
from torch.nn import functional as F |
|||
from torch.nn import init |
|||
import scipy.sparse as sp |
|||
|
|||
from timm.models.layers import DropPath |
|||
|
|||
""" |
|||
Network structure: |
|||
128 joint 128 part |
|||
+ = 256 -> mlp * 3 -> + = 256 -> mlp * 3 |
|||
128 pose 128 pose |
|||
""" |
|||
|
|||
|
|||
class Mlp(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.1): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.drop = nn.Dropout(drop) |
|||
|
|||
def forward(self, x): |
|||
x = self.fc1(x) |
|||
x = self.act(x) |
|||
x = self.drop(x) |
|||
x = self.fc2(x) |
|||
x = self.drop(x) |
|||
return x |
|||
|
|||
|
|||
class Joint_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.scale = (d_coor) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, s, c-> b, t, s, 3*c |
|||
qkv_t = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,s,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,s,c -> b*h*s,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t ', h=self.head) # b,t,s,c-> b*h*s,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*s,t,t |
|||
att_t = att_t.softmax(-1) # b*h*s,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t s c -> b c t s ') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t s -> (b h s) t c', h = self.head) # b*h*s,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*s,t,c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h s) t c -> b t s (h c)', s=s, h=self.head) # b,t,s,c |
|||
|
|||
return x_t |
|||
|
|||
|
|||
class Part_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list, head=8): |
|||
super().__init__() |
|||
|
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
|
|||
self.head = head |
|||
|
|||
self.num_of_part = len(part_list) |
|||
self.num_joint_of_part = len(part_list[0]) |
|||
|
|||
self.scale = (d_coor * self.num_joint_of_part) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.layer_norm = nn.LayerNorm(d_coor * self.num_joint_of_part) |
|||
|
|||
self.pos_embed = nn.Embedding(d_time, d_coor * self.num_joint_of_part) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * self.num_joint_of_part, d_coor * self.num_joint_of_part * 3) |
|||
self.drop = DropPath(0.5) |
|||
# check part_list |
|||
for part in part_list: |
|||
assert len(part) == 3 # each part should have 3 joints |
|||
for idx in part: |
|||
assert 0 <= idx < d_joint # joint index should be less than d_joint |
|||
|
|||
self.idx_joint2part = torch.tensor([idx for part in part_list for idx in part], dtype=torch.long) |
|||
self.idx_joint2part = self.idx_joint2part.flatten().cuda() |
|||
idx_part2joint = list(range(d_joint)) |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
idx_part2joint[idx] = i |
|||
self.idx_part2joint = torch.tensor(idx_part2joint, dtype=torch.long).cuda() |
|||
|
|||
self.overlap = self.get_overlap() |
|||
|
|||
# find joints that appear in more than one part |
|||
def get_overlap(self): |
|||
overlap_list = [-1] * self.d_joint |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
if overlap_list[idx] == -1: |
|||
overlap_list[idx] = i |
|||
else: |
|||
if not isinstance(overlap_list[idx], list): |
|||
overlap_i = overlap_list[idx] |
|||
overlap_list[idx] = list() |
|||
overlap_list[idx].append(overlap_i) |
|||
overlap_list[idx].append(i) |
|||
|
|||
overlap = [] |
|||
for i in overlap_list: |
|||
if isinstance(i, list): |
|||
overlap.append(i) |
|||
|
|||
if len(overlap) == 0: |
|||
return None |
|||
else: |
|||
return overlap |
|||
|
|||
def forward(self, input): |
|||
input = torch.index_select(input, 2, self.idx_joint2part) |
|||
input = rearrange(input, 'b t (p j) c -> b t p (j c)', j=self.num_joint_of_part) |
|||
|
|||
b, t, p, c = input.shape |
|||
|
|||
emb = self.pos_embed(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, p, c-> b, t, p, 3*c |
|||
qkv_t = qkv.reshape(b, t, p, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,p,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,p,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,p,c -> b*h*p,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t', h=self.head) # b,t,p,c-> b*h*p,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*p,t,t |
|||
att_t = att_t.softmax(-1) # b*h*p,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t p c -> b c t p') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t p -> (b h p) t c ', h=self.head) # b*h*p,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*p,t,c//h |
|||
|
|||
x = rearrange(x_t, '(b h p) t c -> b h t p c', h=self.head, p=p) # b*h*p,t,c//h -> b,h,t,p,c//h |
|||
|
|||
# restore the original joint ordering |
|||
x = rearrange(x, 'b h t p c -> b t p (h c)') |
|||
x = rearrange(x, 'b t p (j c) -> b t (p j) c', j=self.num_joint_of_part) |
|||
|
|||
# average the duplicated copies of overlapping joints |
|||
if self.overlap: |
|||
for overlap in self.overlap: |
|||
idx = overlap[-1] |
|||
for i in overlap[:-1]: |
|||
x[:, :, idx, :] += x[:, :, i, :] |
|||
x[:, :, idx, :] /= len(overlap) |
|||
|
|||
x = torch.index_select(x, 2, self.idx_part2joint) |
|||
return x |
|||
|
|||
|
|||
class Pose_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
self.head = head |
|||
|
|||
self.scale = (d_coor * d_joint) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.layer_norm = nn.LayerNorm(d_coor * self.d_joint) |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor * d_joint) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * d_joint, d_coor * d_joint * 3) |
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
input = rearrange(input, 'b t s c -> b t (s c)') |
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, s*c -> b, t, 3*s*c |
|||
qkv_t = qkv.reshape(b, t, s*c, 3).permute(3, 0, 1, 2) # 3,b,t,s*c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s*c |
|||
|
|||
# reshape for mat |
|||
q_t = rearrange(q_t, 'b t (h c) -> (b h) t c', h=self.head) # b,t,s*c -> b*h,t,s*c//h |
|||
k_t = rearrange(k_t, 'b t (h c) -> (b h) c t ', h=self.head) # b,t,s*c-> b*h,s*c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h,t,t |
|||
att_t = att_t.softmax(-1) # b*h,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t (h c) -> (b h) t c', h=self.head) # b*h,t,s*c//h |
|||
|
|||
x_t = att_t @ v_t # b*h,t,s*c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h) t (s c) -> b t s (h c) ', h=self.head, s=s) # b*h,t,s*c//h -> b,t,s,c |
|||
|
|||
return x_t |
|||
|
|||
|
|||
class HP_BLOCK(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list, count): |
|||
super().__init__() |
|||
|
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.mlp = Mlp(d_coor, d_coor*4, d_coor) |
|||
|
|||
self.count = count |
|||
|
|||
if count < 3: |
|||
self.joint_att = Joint_ATTENTION(d_time, d_joint, d_coor//2) |
|||
else: |
|||
self.part_att = Part_ATTENTION(d_time, d_joint, d_coor//2, part_list) |
|||
|
|||
self.pose_att = Pose_ATTENTION(d_time, d_joint, d_coor//2) |
|||
|
|||
self.drop = DropPath(0.0) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
h = input |
|||
# x = self.layer_norm(input) |
|||
|
|||
x_joint, x_pose = input.chunk(2, 3) |
|||
|
|||
if self.count < 3: |
|||
x = torch.cat(( |
|||
self.joint_att(x_joint), |
|||
self.pose_att(x_pose) |
|||
), -1) |
|||
else: |
|||
x = torch.cat(( |
|||
self.part_att(x_joint), |
|||
self.pose_att(x_pose) |
|||
), -1) |
|||
|
|||
x = x + h |
|||
x = x + self.drop(self.mlp(self.layer_norm(x))) |
|||
|
|||
return x |
|||
|
|||
|
|||
|
|||
class HPFormer(nn.Module): |
|||
def __init__(self, num_block, d_time, d_joint, d_coor, part_list): |
|||
super(HPFormer, self).__init__() |
|||
|
|||
self.num_block = num_block |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
self.part_list = part_list |
|||
|
|||
self.hp_block = [] |
|||
for i in range(self.num_block): |
|||
self.hp_block.append(HP_BLOCK(self.d_time, self.d_joint, self.d_coor, self.part_list, i)) |
|||
self.hp_block = nn.ModuleList(self.hp_block) |
|||
|
|||
def forward(self, input): |
|||
for i in range(self.num_block): |
|||
input = self.hp_block[i](input) |
|||
|
|||
return input |
|||
|
|||
|
|||
class Model(nn.Module): |
|||
def __init__(self, args): |
|||
super().__init__() |
|||
|
|||
layers, d_hid, frames = args.layers, args.d_hid, args.frames |
|||
num_joints_in, num_joints_out = args.n_joints, args.out_joints |
|||
part_list = args.part_list |
|||
|
|||
# layers, length, d_hid = layers, frames, d_hid |
|||
# num_joints_in, num_joints_out = 17,17 |
|||
|
|||
self.pose_emb = nn.Linear(2, d_hid, bias=False) |
|||
self.gelu = nn.GELU() |
|||
self.hpformer = HPFormer(layers, frames, num_joints_in, d_hid, part_list) |
|||
self.regress_head = nn.Linear(d_hid, 3, bias=False) |
|||
|
|||
def forward(self, x): |
|||
# b, t, s, c = x.shape #batch,frame,joint,coordinate |
|||
# dimension transfer |
|||
x = self.pose_emb(x) |
|||
x = self.gelu(x) |
|||
# spatio-temporal correlation |
|||
x = self.hpformer(x) |
|||
# regression head |
|||
x = self.regress_head(x) |
|||
|
|||
return x |
|||
|
|||
class Args: |
|||
def __init__(self, layers, d_hid, frames, n_joints, out_joints): |
|||
self.layers = layers |
|||
self.d_hid = d_hid |
|||
self.frames = frames |
|||
self.n_joints = n_joints |
|||
self.out_joints = out_joints |
|||
|
|||
if __name__ == "__main__": |
|||
# inputs = torch.rand(64, 351, 34) # [btz, channel, T, H, W] |
|||
# inputs = torch.rand(1, 64, 4, 112, 112) #[btz, channel, T, H, W] |
|||
args = Args(layers=6, d_hid=192, frames=27, n_joints=17, out_joints=17) |
|||
args.part_list = [ |
|||
[8, 9, 10], # head |
|||
[0, 7, 8], # torso |
|||
[11, 12, 13], # left arm |
|||
[14, 15, 16], # right arm |
|||
[4, 5, 6], # left leg |
|||
[1, 2, 3] # right leg |
|||
] |
|||
import os |
|||
os.environ["CUDA_VISIBLE_DEVICES"] = "1" |
|||
|
|||
net = Model(args) |
|||
inputs = torch.rand([1, 27, 17, 2]) |
|||
if torch.cuda.is_available(): |
|||
net = net.cuda() |
|||
inputs = inputs.cuda() |
|||
output = net(inputs) |
|||
print(output.size()) |
|||
|
|||
from thop import profile |
|||
# flops = 2*macs; measure the model's compute cost and parameter count |
|||
macs, params = profile(net, inputs=(inputs,)) |
|||
print(2*macs) |
|||
print(params) |
@ -0,0 +1,364 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
# from model.module.trans import Transformer as Transformer_s |
|||
# from model.module.trans_hypothesis import Transformer |
|||
import numpy as np |
|||
from einops import rearrange |
|||
from collections import OrderedDict |
|||
from torch.nn import functional as F |
|||
from torch.nn import init |
|||
import scipy.sparse as sp |
|||
|
|||
from timm.models.layers import DropPath |
|||
|
|||
|
|||
|
|||
class Mlp(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.1): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.drop = nn.Dropout(drop) |
|||
|
|||
def forward(self, x): |
|||
x = self.fc1(x) |
|||
x = self.act(x) |
|||
x = self.drop(x) |
|||
x = self.fc2(x) |
|||
x = self.drop(x) |
|||
return x |
|||
|
|||
|
|||
class Joint_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.scale = (d_coor) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, s, c-> b, t, s, 3*c |
|||
qkv_t = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,s,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,s,c -> b*h*s,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t ', h=self.head) # b,t,s,c-> b*h*s,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*s,t,t |
|||
att_t = att_t.softmax(-1) # b*h*s,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t s c -> b c t s ') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t s -> (b h s) t c', h = self.head) # b*h*s,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*s,t,c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h s) t c -> b t s (h c)', s=s, h=self.head) # b,t,s,c |
|||
|
|||
return x_t |
|||
|
|||
|
|||
class Part_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list, head=8): |
|||
super().__init__() |
|||
|
|||
""" |
|||
d_time: number of frames |
|||
d_joint: number of joints |
|||
d_coor: embedding dimension |
|||
""" |
|||
|
|||
self.head = head |
|||
|
|||
self.num_of_part = len(part_list) |
|||
self.num_joint_of_part = len(part_list[0]) |
|||
|
|||
self.scale = (d_coor * self.num_joint_of_part) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.layer_norm = nn.LayerNorm(d_coor * self.num_joint_of_part) |
|||
|
|||
self.pos_embed = nn.Embedding(d_time, d_coor * self.num_joint_of_part) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * self.num_joint_of_part, d_coor * self.num_joint_of_part * 3) |
|||
self.drop = DropPath(0.5) |
|||
# check part_list |
|||
for part in part_list: |
|||
assert len(part) == 3 # each part should have 3 joints |
|||
for idx in part: |
|||
assert 0 <= idx < d_joint # joint index should be less than d_joint |
|||
|
|||
self.idx_joint2part = torch.tensor([idx for part in part_list for idx in part], dtype=torch.long) |
|||
self.idx_joint2part = self.idx_joint2part.flatten().cuda() |
|||
idx_part2joint = list(range(d_joint)) |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
idx_part2joint[idx] = i |
|||
self.idx_part2joint = torch.tensor(idx_part2joint, dtype=torch.long).cuda() |
|||
|
|||
self.overlap = self.get_overlap() |
|||
|
|||
# find joints that appear in more than one part |
|||
def get_overlap(self): |
|||
overlap_list = [-1] * self.d_joint |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
if overlap_list[idx] == -1: |
|||
overlap_list[idx] = i |
|||
else: |
|||
if not isinstance(overlap_list[idx], list): |
|||
overlap_i = overlap_list[idx] |
|||
overlap_list[idx] = list() |
|||
overlap_list[idx].append(overlap_i) |
|||
overlap_list[idx].append(i) |
|||
|
|||
overlap = [] |
|||
for i in overlap_list: |
|||
if isinstance(i, list): |
|||
overlap.append(i) |
|||
|
|||
if len(overlap) == 0: |
|||
return None |
|||
else: |
|||
return overlap |
|||
|
|||
def forward(self, input): |
|||
input = torch.index_select(input, 2, self.idx_joint2part) |
|||
input = rearrange(input, 'b t (p j) c -> b t p (j c)', j=self.num_joint_of_part) |
|||
|
|||
b, t, p, c = input.shape |
|||
|
|||
emb = self.pos_embed(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, p, c-> b, t, p, 3*c |
|||
qkv_t = qkv.reshape(b, t, p, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,p,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,p,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,p,c -> b*h*p,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t', h=self.head) # b,t,p,c-> b*h*p,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*p,t,t |
|||
att_t = att_t.softmax(-1) # b*h*p,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t p c -> b c t p') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t p -> (b h p) t c ', h=self.head) # b*h*p,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*p,t,c//h |
|||
|
|||
x = rearrange(x_t, '(b h p) t c -> b h t p c', h=self.head, p=p) # b*h*p,t,c//h -> b,h,t,p,c//h |
|||
|
|||
# restore the original b,t,(p j),c joint layout
|||
x = rearrange(x, 'b h t p c -> b t p (h c)') |
|||
x = rearrange(x, 'b t p (j c) -> b t (p j) c', j=self.num_joint_of_part) |
|||
|
|||
# average the features of joints that appear in multiple parts
|||
if self.overlap: |
|||
for overlap in self.overlap: |
|||
idx = overlap[-1] |
|||
for i in overlap[:-1]: |
|||
x[:, :, idx, :] += x[:, :, i, :] |
|||
x[:, :, idx, :] /= len(overlap) |
|||
|
|||
x = torch.index_select(x, 2, self.idx_part2joint) |
|||
return x |
|||
|
|||
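# Sketch of the part-index bookkeeping above: joints shared by several parts
# (joint 8 sits in both the head and torso triples of the demo part_list at
# the bottom of this file) are duplicated in idx_joint2part, and get_overlap()
# records the duplicated positions so the forward pass can average them back
# into one joint.
def _part_index_sketch():
    part_list = [[8, 9, 10], [0, 7, 8], [11, 12, 13], [14, 15, 16], [4, 5, 6], [1, 2, 3]]
    idx_joint2part = [idx for part in part_list for idx in part]
    print(len(idx_joint2part))  # 18 entries for 17 joints: joint 8 is duplicated
    print([i for i, j in enumerate(idx_joint2part) if j == 8])  # [0, 5] == get_overlap()[0]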
|
|||
class Pose_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
self.head = head |
|||
|
|||
self.scale = (d_coor * d_joint) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.layer_norm = nn.LayerNorm(d_coor * self.d_joint) |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor * d_joint) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * d_joint, d_coor * d_joint * 3) |
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
input = rearrange(input, 'b t s c -> b t (s c)') |
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input) # b, t, s*c -> b, t, 3*s*c |
|||
qkv_t = qkv.reshape(b, t, s*c, 3).permute(3, 0, 1, 2) # 3,b,t,s*c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s*c |
|||
|
|||
# reshape for mat |
|||
q_t = rearrange(q_t, 'b t (h c) -> (b h) t c', h=self.head) # b,t,s*c -> b*h,t,s*c//h |
|||
k_t = rearrange(k_t, 'b t (h c) -> (b h) c t ', h=self.head) # b,t,s*c-> b*h,s*c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h,t,t |
|||
att_t = att_t.softmax(-1) # b*h,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t (h c) -> (b h) t c', h=self.head) # b*h,t,s*c//h |
|||
|
|||
x_t = att_t @ v_t # b*h,t,s*c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h) t (s c) -> b t s (h c) ', h=self.head, s=s) # b*h,t,s*c//h -> b,t,s,c |
|||
|
|||
return x_t |
|||
|
|||
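# Usage sketch: Pose_ATTENTION flattens all joints of a frame into a single
# token of size s*c, runs temporal attention over whole poses, and restores
# the joint axis on output. Sizes are illustrative assumptions; CUDA is
# required for frame_idx.
def _pose_attention_shape_check():
    pose_att = Pose_ATTENTION(d_time=27, d_joint=17, d_coor=64, head=8).cuda()
    x = torch.rand(2, 27, 17, 64).cuda()  # b, t, s, c
    print(pose_att(x).shape)  # torch.Size([2, 27, 17, 64])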
|
|||
class HP_BLOCK(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list): |
|||
super().__init__() |
|||
|
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.mlp = Mlp(d_coor, d_coor*4, d_coor) |
|||
|
|||
self.joint_att = Joint_ATTENTION(d_time, d_joint, d_coor//3) |
|||
self.part_att = Part_ATTENTION(d_time, d_joint, d_coor//3, part_list) |
|||
self.pose_att = Pose_ATTENTION(d_time, d_joint, d_coor//3) |
|||
|
|||
self.drop = DropPath(0.0) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
h = input |
|||
# x = self.layer_norm(input) |
|||
|
|||
x_joint, x_part, x_pose = input.chunk(3, 3) |
|||
|
|||
x = torch.cat(( |
|||
self.joint_att(x_joint), |
|||
self.part_att(x_part), |
|||
self.pose_att(x_pose) |
|||
), -1) |
|||
|
|||
x = x + h |
|||
x = x + self.drop(self.mlp(self.layer_norm(x))) |
|||
|
|||
return x |
|||
|
|||
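# Sanity-check sketch for HP_BLOCK's channel routing: the input channels are
# chunked into three equal groups for joint-, part-, and pose-level attention,
# so d_coor must be divisible by 3 and each d_coor//3 slice by the head count.
def _hp_block_divisibility_check(d_coor=192, head=8):
    assert d_coor % 3 == 0, 'HP_BLOCK chunks channels into 3 groups'
    assert (d_coor // 3) % head == 0, 'each attention branch splits its slice into heads'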
|
|||
class HPFormer(nn.Module): |
|||
def __init__(self, num_block, d_time, d_joint, d_coor, part_list): |
|||
super(HPFormer, self).__init__() |
|||
|
|||
self.num_block = num_block |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
self.part_list = part_list |
|||
|
|||
self.hp_block = [] |
|||
for l in range(self.num_block): |
|||
self.hp_block.append(HP_BLOCK(self.d_time, self.d_joint, self.d_coor, self.part_list)) |
|||
self.hp_block = nn.ModuleList(self.hp_block) |
|||
|
|||
def forward(self, input): |
|||
for i in range(self.num_block): |
|||
input = self.hp_block[i](input) |
|||
|
|||
return input |
|||
|
|||
|
|||
class Model(nn.Module): |
|||
def __init__(self, args): |
|||
super().__init__() |
|||
|
|||
layers, d_hid, frames = args.layers, args.d_hid, args.frames |
|||
num_joints_in, num_joints_out = args.n_joints, args.out_joints |
|||
part_list = args.part_list |
|||
|
|||
# layers, length, d_hid = layers, frames, d_hid |
|||
# num_joints_in, num_joints_out = 17,17 |
|||
|
|||
self.pose_emb = nn.Linear(2, d_hid, bias=False) |
|||
self.gelu = nn.GELU() |
|||
self.hpformer = HPFormer(layers, frames, num_joints_in, d_hid, part_list) |
|||
self.regress_head = nn.Linear(d_hid, 3, bias=False) |
|||
|
|||
def forward(self, x): |
|||
# b, t, s, c = x.shape #batch,frame,joint,coordinate |
|||
# dimension transfer
|||
x = self.pose_emb(x) |
|||
x = self.gelu(x) |
|||
# spatio-temporal correlation |
|||
x = self.hpformer(x) |
|||
# regression head |
|||
x = self.regress_head(x) |
|||
|
|||
return x |
|||
|
|||
class Args: |
|||
def __init__(self, layers, d_hid, frames, n_joints, out_joints): |
|||
self.layers = layers |
|||
self.d_hid = d_hid |
|||
self.frames = frames |
|||
self.n_joints = n_joints |
|||
self.out_joints = out_joints |
|||
|
|||
if __name__ == "__main__": |
|||
# inputs = torch.rand(64, 351, 34) # [btz, channel, T, H, W] |
|||
# inputs = torch.rand(1, 64, 4, 112, 112) #[btz, channel, T, H, W] |
|||
args = Args(layers=6, d_hid=192, frames=27, n_joints=17, out_joints=17) |
|||
args.part_list = [ |
|||
[8, 9, 10],    # head
[0, 7, 8],     # torso
[11, 12, 13],  # left arm
[14, 15, 16],  # right arm
[4, 5, 6],     # left leg
[1, 2, 3]      # right leg
|||
] |
|||
net = Model(args) |
|||
inputs = torch.rand([1, 27, 17, 2]) |
|||
if torch.cuda.is_available(): |
|||
net = net.cuda() |
|||
inputs = inputs.cuda() |
|||
output = net(inputs) |
|||
print(output.size()) |
|||
|
|||
from thop import profile |
|||
# flops = 2*macs; profile the model's MACs and parameter count
|||
macs, params = profile(net, inputs=(inputs,)) |
|||
print(2*macs) |
|||
print(params) |
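    # Optional human-readable formatting via thop's clever_format helper (a
    # sketch; assumes the installed thop version provides clever_format, which
    # current releases do).
    from thop import clever_format
    macs_str, params_str = clever_format([macs, params], '%.3f')
    print(macs_str, params_str)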
@ -0,0 +1,400 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
# from model.module.trans import Transformer as Transformer_s |
|||
# from model.module.trans_hypothesis import Transformer |
|||
import numpy as np |
|||
from einops import rearrange |
|||
from collections import OrderedDict |
|||
from torch.nn import functional as F |
|||
from torch.nn import init |
|||
import scipy.sparse as sp |
|||
|
|||
from timm.models.layers import DropPath |
|||
|
|||
|
|||
|
|||
class Mlp(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.1): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.drop = nn.Dropout(drop) |
|||
|
|||
def forward(self, x): |
|||
x = self.fc1(x) |
|||
x = self.act(x) |
|||
x = self.drop(x) |
|||
x = self.fc2(x) |
|||
x = self.drop(x) |
|||
return x |
|||
|
|||
|
|||
class Joint_ATTENTION_Spatial(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.scale = (d_coor) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
|
|||
self.pos_emb = nn.Embedding(d_joint, d_coor) |
|||
self.frame_idx = torch.tensor(list(range(d_joint))).long().cuda() |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, None, :, :] |
|||
|
|||
input = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(input)  # b, t, s, c -> b, t, s, 3*c
qkv_s = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3)  # 3,b,t,s,c

# spatial attention over joints (completion; mirrors the spatial branch of
# Joint_ATTENTION(is_spatial=True) defined below)
q_s, k_s, v_s = qkv_s[0], qkv_s[1], qkv_s[2]  # b,t,s,c

q_s = rearrange(q_s, 'b t s (h c) -> (b h t) s c', h=self.head)  # b*h*t,s,c//h
k_s = rearrange(k_s, 'b t s (h c) -> (b h t) c s', h=self.head)  # b*h*t,c//h,s
v_s = rearrange(v_s, 'b t s (h c) -> (b h t) s c', h=self.head)  # b*h*t,s,c//h

att_s = ((q_s @ k_s) * self.scale).softmax(-1)  # b*h*t,s,s

x_s = att_s @ v_s  # b*h*t,s,c//h
x_s = rearrange(x_s, '(b h t) s c -> b t s (h c)', t=t, h=self.head)  # b,t,s,c

return x_s
|||
|
|||
|
|||
class Joint_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8, is_spatial=False): |
|||
super().__init__() |
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
|
|||
self.scale = (d_coor) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.is_spatial = is_spatial |
|||
|
|||
if is_spatial: |
|||
self.pos_emb = nn.Embedding(d_time, d_coor) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
else: |
|||
self.pos_emb = nn.Embedding(d_joint, d_coor) |
|||
self.frame_idx = torch.tensor(list(range(d_joint))).long().cuda() |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
if self.is_spatial: |
|||
input = input + emb[None, None, :, :] |
|||
else: |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
qkv = self.qkv(input) # b, t, s, c-> b, t, s, 3*c |
|||
qkv = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,s,c |
|||
|
|||
q, k, v = qkv[0], qkv[1], qkv[2] # b,t,s,c |
|||
|
|||
if self.is_spatial:
    # spatial attention: joints attend to one another within each frame
    q = rearrange(q, 'b t s (h c) -> (b h t) s c', h=self.head)  # b,t,s,c -> b*h*t,s,c//h
    k = rearrange(k, 'b t s (h c) -> (b h t) c s', h=self.head)  # b,t,s,c -> b*h*t,c//h,s
    v = rearrange(v, 'b t s (h c) -> (b h t) s c', h=self.head)  # b,t,s,c -> b*h*t,s,c//h
else:
    # temporal attention: each joint attends across the t frames
    q = rearrange(q, 'b t s (h c) -> (b h s) t c', h=self.head)  # b,t,s,c -> b*h*s,t,c//h
    k = rearrange(k, 'b t s (h c) -> (b h s) c t', h=self.head)  # b,t,s,c -> b*h*s,c//h,t
    v = rearrange(v, 'b t s (h c) -> (b h s) t c', h=self.head)  # b,t,s,c -> b*h*s,t,c//h

att = (q @ k) * self.scale
att = att.softmax(-1)

# MSA
x = att @ v

if self.is_spatial:
    x = rearrange(x, '(b h t) s c -> b t s (h c)', t=t, h=self.head)  # b,t,s,c
else:
    x = rearrange(x, '(b h s) t c -> b t s (h c)', s=s, h=self.head)  # b,t,s,c

return x
|||
|
|||
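# Usage sketch: with the is_spatial flag, one class covers both attention
# axes. Sizes are illustrative assumptions; CUDA is required because the
# embedding index is created with .cuda() in __init__.
def _joint_attention_axes_sketch():
    spatial = Joint_ATTENTION(d_time=27, d_joint=17, d_coor=64, is_spatial=True).cuda()
    temporal = Joint_ATTENTION(d_time=27, d_joint=17, d_coor=64, is_spatial=False).cuda()
    x = torch.rand(2, 27, 17, 64).cuda()
    print(spatial(x).shape, temporal(x).shape)  # both keep the b,t,s,c layout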
|
|||
class Part_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list, head=8): |
|||
super().__init__() |
|||
|
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
|
|||
self.head = head |
|||
|
|||
self.num_of_part = len(part_list) |
|||
self.num_joint_of_part = len(part_list[0]) |
|||
|
|||
self.scale = (d_coor * self.num_joint_of_part) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.layer_norm = nn.LayerNorm(d_coor * self.num_joint_of_part) |
|||
|
|||
self.pos_embed = nn.Embedding(d_time, d_coor * self.num_joint_of_part) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * self.num_joint_of_part, d_coor * self.num_joint_of_part * 3) |
|||
self.drop = DropPath(0.5) |
|||
# check part_list |
|||
for part in part_list: |
|||
assert len(part) == 3 # each part should have 3 joints |
|||
for idx in part: |
|||
assert 0 <= idx < d_joint # joint index should be less than d_joint |
|||
|
|||
self.idx_joint2part = torch.tensor([idx for part in part_list for idx in part], dtype=torch.long) |
|||
self.idx_joint2part = self.idx_joint2part.flatten().cuda() |
|||
idx_part2joint = list(range(d_joint)) |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
idx_part2joint[idx] = i |
|||
self.idx_part2joint = torch.tensor(idx_part2joint, dtype=torch.long).cuda() |
|||
|
|||
self.overlap = self.get_overlap() |
|||
|
|||
# find joints that appear in more than one part
|||
def get_overlap(self): |
|||
overlap_list = [-1] * self.d_joint |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
if overlap_list[idx] == -1: |
|||
overlap_list[idx] = i |
|||
else: |
|||
if not isinstance(overlap_list[idx], list): |
|||
overlap_i = overlap_list[idx] |
|||
overlap_list[idx] = list() |
|||
overlap_list[idx].append(overlap_i) |
|||
overlap_list[idx].append(i) |
|||
|
|||
overlap = [] |
|||
for i in overlap_list: |
|||
if isinstance(i, list): |
|||
overlap.append(i) |
|||
|
|||
if len(overlap) == 0: |
|||
return None |
|||
else: |
|||
return overlap |
|||
|
|||
def forward(self, input): |
|||
input = torch.index_select(input, 2, self.idx_joint2part) |
|||
input = rearrange(input, 'b t (p j) c -> b t p (j c)', j=self.num_joint_of_part) |
|||
|
|||
b, t, p, c = input.shape |
|||
|
|||
emb = self.pos_embed(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
qkv = self.qkv(input) # b, t, p, c-> b, t, p, 3*c |
|||
qkv_t = qkv.reshape(b, t, p, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,p,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,p,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,p,c -> b*h*p,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t', h=self.head) # b,t,p,c-> b*h*p,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*p,t,t |
|||
att_t = att_t.softmax(-1) # b*h*p,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t p c -> b c t p') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t p -> (b h p) t c ', h=self.head) # b*h*p,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*p,t,c//h |
|||
|
|||
x = rearrange(x_t, '(b h p) t c -> b h t p c', h=self.head, p=p) # b*h*p,t,c//h -> b,h,t,p,c//h |
|||
|
|||
# restore the original b,t,(p j),c joint layout
|||
x = rearrange(x, 'b h t p c -> b t p (h c)') |
|||
x = rearrange(x, 'b t p (j c) -> b t (p j) c', j=self.num_joint_of_part) |
|||
|
|||
# average the features of joints that appear in multiple parts
|||
if self.overlap: |
|||
for overlap in self.overlap: |
|||
idx = overlap[-1] |
|||
for i in overlap[:-1]: |
|||
x[:, :, idx, :] += x[:, :, i, :] |
|||
x[:, :, idx, :] /= len(overlap) |
|||
|
|||
x = torch.index_select(x, 2, self.idx_part2joint) |
|||
return x |
|||
|
|||
|
|||
class Pose_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
self.head = head |
|||
|
|||
self.scale = (d_coor * d_joint) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor * d_joint) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * d_joint, d_coor * d_joint * 3) |
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
input = rearrange(input, 'b t s c -> b t (s c)') |
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :] |
|||
|
|||
qkv = self.qkv(input) # b, t, s*c -> b, t, 3*s*c |
|||
qkv_t = qkv.reshape(b, t, s*c, 3).permute(3, 0, 1, 2) # 3,b,t,s*c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s*c |
|||
|
|||
# reshape for mat |
|||
q_t = rearrange(q_t, 'b t (h c) -> (b h) t c', h=self.head) # b,t,s*c -> b*h,t,s*c//h |
|||
k_t = rearrange(k_t, 'b t (h c) -> (b h) c t ', h=self.head) # b,t,s*c-> b*h,s*c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h,t,t |
|||
att_t = att_t.softmax(-1) # b*h,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t (h c) -> (b h) t c', h=self.head) # b*h,t,s*c//h |
|||
|
|||
x_t = att_t @ v_t # b*h,t,s*c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h) t (s c) -> b t s (h c) ', h=self.head, s=s) # b*h,t,s*c//h -> b,t,s,c |
|||
|
|||
return x_t |
|||
|
|||
|
|||
class HP_BLOCK(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list): |
|||
super().__init__() |
|||
|
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.mlp = Mlp(d_coor, d_coor*4, d_coor) |
|||
|
|||
self.joint_att = Joint_ATTENTION(d_time, d_joint, d_coor//3) |
|||
self.part_att = Part_ATTENTION(d_time, d_joint, d_coor//3, part_list) |
|||
self.pose_att = Pose_ATTENTION(d_time, d_joint, d_coor//3) |
|||
|
|||
self.drop = DropPath(0.0) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
h = input |
|||
x = self.layer_norm(input) |
|||
|
|||
x_joint, x_part, x_pose = x.chunk(3, 3) |
|||
|
|||
x = torch.cat(( |
|||
self.joint_att(x_joint), |
|||
self.part_att(x_part), |
|||
self.pose_att(x_pose) |
|||
), -1) |
|||
|
|||
x = x + h |
|||
x = x + self.drop(self.mlp(self.layer_norm(x))) |
|||
|
|||
return x |
|||
|
|||
|
|||
class HPFormer(nn.Module): |
|||
def __init__(self, num_block, d_time, d_joint, d_coor, part_list): |
|||
super(HPFormer, self).__init__() |
|||
|
|||
self.num_block = num_block |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
self.part_list = part_list |
|||
|
|||
self.hp_block = [] |
|||
for l in range(self.num_block): |
|||
self.hp_block.append(HP_BLOCK(self.d_time, self.d_joint, self.d_coor, self.part_list)) |
|||
self.hp_block = nn.ModuleList(self.hp_block) |
|||
|
|||
def forward(self, input): |
|||
for i in range(self.num_block): |
|||
input = self.hp_block[i](input) |
|||
|
|||
return input |
|||
|
|||
|
|||
class Model(nn.Module): |
|||
def __init__(self, args): |
|||
super().__init__() |
|||
|
|||
layers, d_hid, frames = args.layers, args.d_hid, args.frames |
|||
num_joints_in, num_joints_out = args.n_joints, args.out_joints |
|||
part_list = args.part_list |
|||
|
|||
# layers, length, d_hid = layers, frames, d_hid |
|||
# num_joints_in, num_joints_out = 17,17 |
|||
|
|||
self.pose_emb = nn.Linear(2, d_hid, bias=False) |
|||
self.gelu = nn.GELU() |
|||
self.hpformer = HPFormer(layers, frames, num_joints_in, d_hid, part_list) |
|||
self.regress_head = nn.Linear(d_hid, 3, bias=False) |
|||
|
|||
def forward(self, x): |
|||
# b, t, s, c = x.shape #batch,frame,joint,coordinate |
|||
# dimension transfer
|||
x = self.pose_emb(x) |
|||
x = self.gelu(x) |
|||
# spatio-temporal correlation |
|||
x = self.hpformer(x) |
|||
# regression head |
|||
x = self.regress_head(x) |
|||
|
|||
return x |
|||
|
|||
class Args: |
|||
def __init__(self, layers, d_hid, frames, n_joints, out_joints): |
|||
self.layers = layers |
|||
self.d_hid = d_hid |
|||
self.frames = frames |
|||
self.n_joints = n_joints |
|||
self.out_joints = out_joints |
|||
|
|||
if __name__ == "__main__": |
|||
# inputs = torch.rand(64, 351, 34) # [btz, channel, T, H, W] |
|||
# inputs = torch.rand(1, 64, 4, 112, 112) #[btz, channel, T, H, W] |
|||
args = Args(layers=6, d_hid=192, frames=27, n_joints=17, out_joints=17) |
|||
args.part_list = [ |
|||
[8, 9, 10],    # head
[0, 7, 8],     # torso
[11, 12, 13],  # left arm
[14, 15, 16],  # right arm
[4, 5, 6],     # left leg
[1, 2, 3]      # right leg
|||
] |
|||
net = Model(args) |
|||
inputs = torch.rand([1, 27, 17, 2]) |
|||
if torch.cuda.is_available(): |
|||
net = net.cuda() |
|||
inputs = inputs.cuda() |
|||
output = net(inputs) |
|||
print(output.size()) |
|||
|
|||
from thop import profile |
|||
# flops = 2*macs; profile the model's MACs and parameter count
|||
macs, params = profile(net, inputs=(inputs,)) |
|||
print(2*macs) |
|||
print(params) |
@ -0,0 +1,355 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
# from model.module.trans import Transformer as Transformer_s |
|||
# from model.module.trans_hypothesis import Transformer |
|||
import numpy as np |
|||
from einops import rearrange |
|||
from collections import OrderedDict |
|||
from torch.nn import functional as F |
|||
from torch.nn import init |
|||
import scipy.sparse as sp |
|||
|
|||
from timm.models.layers import DropPath |
|||
|
|||
|
|||
|
|||
class Mlp(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.1): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.drop = nn.Dropout(drop) |
|||
|
|||
def forward(self, x): |
|||
x = self.fc1(x) |
|||
x = self.act(x) |
|||
x = self.drop(x) |
|||
x = self.fc2(x) |
|||
x = self.drop(x) |
|||
return x |
|||
|
|||
|
|||
class Joint_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
|
|||
self.scale = (d_coor) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
qkv = self.qkv(input) # b, t, s, c-> b, t, s, 3*c |
|||
qkv_t = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,s,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,s,c -> b*h*s,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t ', h=self.head) # b,t,s,c-> b*h*s,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*s,t,t |
|||
att_t = att_t.softmax(-1) # b*h*s,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t s c -> b c t s ') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t s -> (b h s) t c', h = self.head) # b*h*s,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*s,t,c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h s) t c -> b t s (h c)', s=s, h=self.head) # b,t,s,c |
|||
|
|||
return x_t |
|||
|
|||
|
|||
class Part_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list, head=8): |
|||
super().__init__() |
|||
|
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
|
|||
self.head = head |
|||
|
|||
self.num_of_part = len(part_list) |
|||
self.num_joint_of_part = len(part_list[0]) |
|||
|
|||
self.scale = (d_coor * self.num_joint_of_part) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.layer_norm = nn.LayerNorm(d_coor * self.num_joint_of_part) |
|||
|
|||
self.pos_embed = nn.Embedding(d_time, d_coor * self.num_joint_of_part) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * self.num_joint_of_part, d_coor * self.num_joint_of_part * 3) |
|||
self.drop = DropPath(0.5) |
|||
# check part_list |
|||
for part in part_list: |
|||
assert len(part) == 3 # each part should have 3 joints |
|||
for idx in part: |
|||
assert 0 <= idx < d_joint # joint index should be less than d_joint |
|||
|
|||
self.idx_joint2part = torch.tensor([idx for part in part_list for idx in part], dtype=torch.long) |
|||
self.idx_joint2part = self.idx_joint2part.flatten().cuda() |
|||
idx_part2joint = list(range(d_joint)) |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
idx_part2joint[idx] = i |
|||
self.idx_part2joint = torch.tensor(idx_part2joint, dtype=torch.long).cuda() |
|||
|
|||
self.overlap = self.get_overlap() |
|||
|
|||
# find joints that appear in more than one part
|||
def get_overlap(self): |
|||
overlap_list = [-1] * self.d_joint |
|||
for i, idx in enumerate(self.idx_joint2part): |
|||
if overlap_list[idx] == -1: |
|||
overlap_list[idx] = i |
|||
else: |
|||
if not isinstance(overlap_list[idx], list): |
|||
overlap_i = overlap_list[idx] |
|||
overlap_list[idx] = list() |
|||
overlap_list[idx].append(overlap_i) |
|||
overlap_list[idx].append(i) |
|||
|
|||
overlap = [] |
|||
for i in overlap_list: |
|||
if isinstance(i, list): |
|||
overlap.append(i) |
|||
|
|||
if len(overlap) == 0: |
|||
return None |
|||
else: |
|||
return overlap |
|||
|
|||
def forward(self, input): |
|||
input = torch.index_select(input, 2, self.idx_joint2part) |
|||
input = rearrange(input, 'b t (p j) c -> b t p (j c)', j=self.num_joint_of_part) |
|||
|
|||
b, t, p, c = input.shape |
|||
|
|||
emb = self.pos_embed(self.frame_idx) |
|||
input = input + emb[None, :, None, :] |
|||
|
|||
qkv = self.qkv(input) # b, t, p, c-> b, t, p, 3*c |
|||
qkv_t = qkv.reshape(b, t, p, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,p,c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,p,c |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,p,c -> b*h*p,t,c//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t', h=self.head) # b,t,p,c-> b*h*p,c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h*p,t,t |
|||
att_t = att_t.softmax(-1) # b*h*p,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t p c -> b c t p') |
|||
|
|||
# MSA |
|||
v_t = rearrange(v_t, 'b (h c) t p -> (b h p) t c ', h=self.head) # b*h*p,t,c//h |
|||
|
|||
x_t = att_t @ v_t # b*h*p,t,c//h |
|||
|
|||
x = rearrange(x_t, '(b h p) t c -> b h t p c', h=self.head, p=p) # b*h*p,t,c//h -> b,h,t,p,c//h |
|||
|
|||
# restore the original b,t,(p j),c joint layout
|||
x = rearrange(x, 'b h t p c -> b t p (h c)') |
|||
x = rearrange(x, 'b t p (j c) -> b t (p j) c', j=self.num_joint_of_part) |
|||
|
|||
# average the features of joints that appear in multiple parts
|||
if self.overlap: |
|||
for overlap in self.overlap: |
|||
idx = overlap[-1] |
|||
for i in overlap[:-1]: |
|||
x[:, :, idx, :] += x[:, :, i, :] |
|||
x[:, :, idx, :] /= len(overlap) |
|||
|
|||
x = torch.index_select(x, 2, self.idx_part2joint) |
|||
return x |
|||
|
|||
|
|||
class Pose_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
self.head = head |
|||
|
|||
self.scale = (d_coor * d_joint) ** -0.5 |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
|
|||
self.pos_emb = nn.Embedding(d_time, d_coor * d_joint) |
|||
self.frame_idx = torch.tensor(list(range(d_time))).long().cuda() |
|||
|
|||
self.qkv = nn.Linear(d_coor * d_joint, d_coor * d_joint * 3) |
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
input = rearrange(input, 'b t s c -> b t (s c)') |
|||
|
|||
emb = self.pos_emb(self.frame_idx) |
|||
input = input + emb[None, :] |
|||
|
|||
qkv = self.qkv(input) # b, t, s*c -> b, t, 3*s*c |
|||
qkv_t = qkv.reshape(b, t, s*c, 3).permute(3, 0, 1, 2) # 3,b,t,s*c |
|||
|
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s*c |
|||
|
|||
# reshape for mat |
|||
q_t = rearrange(q_t, 'b t (h c) -> (b h) t c', h=self.head) # b,t,s*c -> b*h,t,s*c//h |
|||
k_t = rearrange(k_t, 'b t (h c) -> (b h) c t ', h=self.head) # b,t,s*c-> b*h,s*c//h,t |
|||
|
|||
att_t = (q_t @ k_t) * self.scale # b*h,t,t |
|||
att_t = att_t.softmax(-1) # b*h,t,t |
|||
|
|||
v_t = rearrange(v_t, 'b t (h c) -> (b h) t c', h=self.head) # b*h,t,s*c//h |
|||
|
|||
x_t = att_t @ v_t # b*h,t,s*c//h |
|||
|
|||
x_t = rearrange(x_t, '(b h) t (s c) -> b t s (h c) ', h=self.head, s=s) # b*h,t,s*c//h -> b,t,s,c |
|||
|
|||
return x_t |
|||
|
|||
|
|||
class HP_BLOCK(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, part_list): |
|||
super().__init__() |
|||
|
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.mlp = Mlp(d_coor, d_coor*4, d_coor) |
|||
|
|||
self.joint_att = Joint_ATTENTION(d_time, d_joint, d_coor//3) |
|||
self.part_att = Part_ATTENTION(d_time, d_joint, d_coor//3, part_list) |
|||
self.pose_att = Pose_ATTENTION(d_time, d_joint, d_coor//3) |
|||
|
|||
self.drop = DropPath(0.0) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
h = input |
|||
x = self.layer_norm(input) |
|||
|
|||
x_joint, x_part, x_pose = x.chunk(3, 3) |
|||
|
|||
x = torch.cat(( |
|||
self.joint_att(x_joint), |
|||
self.part_att(x_part), |
|||
self.pose_att(x_pose) |
|||
), -1) |
|||
|
|||
x = x + h |
|||
x = x + self.drop(self.mlp(self.layer_norm(x))) |
|||
|
|||
return x |
|||
|
|||
|
|||
class HPFormer(nn.Module): |
|||
def __init__(self, num_block, d_time, d_joint, d_coor, part_list): |
|||
super(HPFormer, self).__init__() |
|||
|
|||
self.num_block = num_block |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
self.part_list = part_list |
|||
|
|||
self.hp_block = [] |
|||
for l in range(self.num_block): |
|||
self.hp_block.append(HP_BLOCK(self.d_time, self.d_joint, self.d_coor, self.part_list)) |
|||
self.hp_block = nn.ModuleList(self.hp_block) |
|||
|
|||
def forward(self, input): |
|||
for i in range(self.num_block): |
|||
input = self.hp_block[i](input) |
|||
|
|||
return input |
|||
|
|||
|
|||
class Model(nn.Module): |
|||
def __init__(self, args): |
|||
super().__init__() |
|||
|
|||
layers, d_hid, frames = args.layers, args.d_hid, args.frames |
|||
num_joints_in, num_joints_out = args.n_joints, args.out_joints |
|||
part_list = args.part_list |
|||
|
|||
# layers, length, d_hid = layers, frames, d_hid |
|||
# num_joints_in, num_joints_out = 17,17 |
|||
|
|||
self.pose_emb = nn.Linear(2, d_hid, bias=False) |
|||
self.gelu = nn.GELU() |
|||
self.hpformer = HPFormer(layers, frames, num_joints_in, d_hid, part_list) |
|||
self.regress_head = nn.Linear(d_hid, 3, bias=False) |
|||
|
|||
def forward(self, x): |
|||
# b, t, s, c = x.shape #batch,frame,joint,coordinate |
|||
# dimension transfer
|||
x = self.pose_emb(x) |
|||
x = self.gelu(x) |
|||
# spatio-temporal correlation |
|||
x = self.hpformer(x) |
|||
# regression head |
|||
x = self.regress_head(x) |
|||
|
|||
return x |
|||
|
|||
class Args: |
|||
def __init__(self, layers, d_hid, frames, n_joints, out_joints): |
|||
self.layers = layers |
|||
self.d_hid = d_hid |
|||
self.frames = frames |
|||
self.n_joints = n_joints |
|||
self.out_joints = out_joints |
|||
|
|||
if __name__ == "__main__": |
|||
# inputs = torch.rand(64, 351, 34) # [btz, channel, T, H, W] |
|||
# inputs = torch.rand(1, 64, 4, 112, 112) #[btz, channel, T, H, W] |
|||
args = Args(layers=6, d_hid=192, frames=27, n_joints=17, out_joints=17) |
|||
args.part_list = [ |
|||
[8, 9, 10],    # head
[0, 7, 8],     # torso
[11, 12, 13],  # left arm
[14, 15, 16],  # right arm
[4, 5, 6],     # left leg
[1, 2, 3]      # right leg
|||
] |
|||
net = Model(args) |
|||
inputs = torch.rand([1, 27, 17, 2]) |
|||
if torch.cuda.is_available(): |
|||
net = net.cuda() |
|||
inputs = inputs.cuda() |
|||
output = net(inputs) |
|||
print(output.size()) |
|||
|
|||
from thop import profile |
|||
# flops = 2*macs; profile the model's MACs and parameter count
|||
macs, params = profile(net, inputs=(inputs,)) |
|||
print(2*macs) |
|||
print(params) |
@ -0,0 +1,419 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
# from model.module.trans import Transformer as Transformer_s |
|||
# from model.module.trans_hypothesis import Transformer |
|||
import numpy as np |
|||
from einops import rearrange |
|||
from collections import OrderedDict |
|||
from torch.nn import functional as F |
|||
from torch.nn import init |
|||
import scipy.sparse as sp |
|||
|
|||
from timm.models.layers import DropPath |
|||
|
|||
|
|||
class Model(nn.Module): |
|||
def __init__(self, args): |
|||
super().__init__() |
|||
|
|||
layers, channel, d_hid, length = args.layers, args.channel, args.d_hid, args.frames |
|||
self.num_joints_in, self.num_joints_out = args.n_joints, args.out_joints |
|||
args.d_hid = 256 |
|||
isTrainning = args.train |
|||
|
|||
# dimension transfer
|||
self.pose_emb = nn.Linear(2, args.d_hid, bias=False) |
|||
self.gelu = nn.GELU() |
|||
|
|||
# self.flow_emb = nn.Linear(2, args.d_hid, bias=False) |
|||
# self.gelu = nn.GELU() |
|||
|
|||
self.mlpmixer = MlpMixer(6, args.frames, 17, args.d_hid, isTrainning) |
|||
|
|||
self.pose_lift = nn.Linear(args.d_hid, 3, bias=False) |
|||
|
|||
# self.sequence_pos_encoder = PositionalEncoding(args.d_hid, 0.1) |
|||
|
|||
# self.tem_pool = nn.AdaptiveAvgPool1d(1) |
|||
# self.lpm = LearnedPosMap2D(args.frames,18) |
|||
|
|||
def forward(self, x): |
|||
x = x[:, :, :, :, 0].permute(0, 2, 3, 1).contiguous()  # B,2,T,J,1 -> B,T,J,2
|||
#x = x.view(x.shape[0], x.shape[1], x.shape[2], -1) # b,t,j,2 |
|||
|
|||
b, t, j, c = x.shape |
|||
|
|||
#g = torch.zeros([b,t,1,c]).cuda() |
|||
#x = torch.cat((x,g),-2) |
|||
|
|||
x = self.pose_emb(x) |
|||
x = self.gelu(x) |
|||
|
|||
|
|||
# x = x.reshape(b,t,j,c) |
|||
|
|||
x = self.mlpmixer(x) |
|||
|
|||
|
|||
x = self.pose_lift(x) |
|||
|
|||
return x |
|||
|
|||
|
|||
def normalize(mx): |
|||
"""Row-normalize sparse matrix""" |
|||
rowsum = np.array(mx.sum(1)) |
|||
r_inv = np.power(rowsum, -1).flatten() |
|||
r_inv[np.isinf(r_inv)] = 0. |
|||
r_mat_inv = sp.diags(r_inv) |
|||
mx = r_mat_inv.dot(mx) |
|||
return mx |
|||
|
|||
|
|||
def sparse_mx_to_torch_sparse_tensor(sparse_mx): |
|||
"""Convert a scipy sparse matrix to a torch sparse tensor.""" |
|||
sparse_mx = sparse_mx.tocoo().astype(np.float32) |
|||
indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) |
|||
values = torch.from_numpy(sparse_mx.data) |
|||
shape = torch.Size(sparse_mx.shape) |
|||
return torch.sparse.FloatTensor(indices, values, shape) |
|||
|
|||
|
|||
def adj_mx_from_edges(num_pts, edges, sparse=False): |
|||
edges = np.array(edges, dtype=np.int32) |
|||
data, i, j = np.ones(edges.shape[0]), edges[:, 0], edges[:, 1] |
|||
adj_mx = sp.coo_matrix((data, (i, j)), shape=(num_pts, num_pts), dtype=np.float32) |
|||
# print(11,adj_mx) |
|||
|
|||
# build symmetric adjacency matrix |
|||
adj_mx = adj_mx + adj_mx.T.multiply(adj_mx.T > adj_mx) - adj_mx.multiply(adj_mx.T > adj_mx) |
|||
# adj_mx = normalize(adj_mx + sp.eye(adj_mx.shape[0])) |
|||
if sparse: |
|||
adj_mx = sparse_mx_to_torch_sparse_tensor(adj_mx) |
|||
else: |
|||
adj_mx = torch.tensor(adj_mx.todense(), dtype=torch.float) |
|||
return adj_mx.sum(-1) |
|||
|
|||
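# Behavior sketch for adj_mx_from_edges: note that it returns the row sums of
# the symmetrized adjacency (a per-node degree vector), not the matrix itself.
# Runs on CPU with a toy 4-node chain.
def _adj_degree_sketch():
    deg = adj_mx_from_edges(4, [[0, 1], [1, 2], [2, 3]])
    print(deg)  # tensor([1., 2., 2., 1.]): end nodes touch one edge, inner nodes two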
|
|||
class ChebConv(nn.Module): |
|||
""" |
|||
The ChebNet convolution operation. |
|||
:param in_c: int, number of input channels. |
|||
:param out_c: int, number of output channels. |
|||
:param K: int, the order of Chebyshev Polynomial. |
|||
""" |
|||
|
|||
def __init__(self, in_c, out_c, K, bias=True, normalize=True): |
|||
super(ChebConv, self).__init__() |
|||
self.normalize = normalize |
|||
|
|||
self.weight = nn.Parameter(torch.Tensor(K + 1, 1, in_c, out_c)) # [K+1, 1, in_c, out_c] |
|||
init.xavier_normal_(self.weight) |
|||
|
|||
if bias: |
|||
self.bias = nn.Parameter(torch.Tensor(1, 1, out_c)) |
|||
init.zeros_(self.bias) |
|||
else: |
|||
self.register_parameter("bias", None) |
|||
|
|||
self.K = K + 1 |
|||
|
|||
def forward(self, inputs, graph): |
|||
""" |
|||
:param inputs: the input data, [B, N, C] |
|||
:param graph: the graph structure, [N, N] |
|||
:return: convolution result, [B, N, D] |
|||
""" |
|||
L = ChebConv.get_laplacian(graph, self.normalize) # [N, N] |
|||
mul_L = self.cheb_polynomial(L).unsqueeze(1) # [K, 1, N, N] |
|||
|
|||
result = torch.matmul(mul_L, inputs) # [K, B, N, C] |
|||
|
|||
result = torch.matmul(result, self.weight) # [K, B, N, D] |
|||
result = torch.sum(result, dim=0) + self.bias # [B, N, D] |
|||
|
|||
return result |
|||
|
|||
def cheb_polynomial(self, laplacian): |
|||
""" |
|||
Compute the Chebyshev Polynomial, according to the graph laplacian. |
|||
:param laplacian: the graph laplacian, [N, N]. |
|||
:return: the multi order Chebyshev laplacian, [K, N, N]. |
|||
""" |
|||
N = laplacian.size(0) # [N, N] |
|||
multi_order_laplacian = torch.zeros([self.K, N, N], device=laplacian.device, dtype=torch.float) # [K, N, N] |
|||
multi_order_laplacian[0] = torch.eye(N, device=laplacian.device, dtype=torch.float) |
|||
|
|||
if self.K == 1: |
|||
return multi_order_laplacian |
|||
else: |
|||
multi_order_laplacian[1] = laplacian |
|||
if self.K == 2: |
|||
return multi_order_laplacian |
|||
else: |
|||
for k in range(2, self.K): |
|||
multi_order_laplacian[k] = 2 * torch.mm(laplacian, multi_order_laplacian[k - 1]) - \ |
|||
multi_order_laplacian[k - 2] |
|||
|
|||
return multi_order_laplacian |
|||
|
|||
@staticmethod |
|||
def get_laplacian(graph, normalize): |
|||
""" |
|||
return the laplacian of the graph. |
|||
:param graph: the graph structure without self loop, [N, N]. |
|||
:param normalize: whether to used the normalized laplacian. |
|||
:return: graph laplacian. |
|||
""" |
|||
if normalize: |
|||
|
|||
D = torch.diag(torch.sum(graph, dim=-1) ** (-1 / 2)) |
|||
L = torch.eye(graph.size(0), device=graph.device, dtype=graph.dtype) - torch.mm(torch.mm(D, graph), D) |
|||
else: |
|||
D = torch.diag(torch.sum(graph, dim=-1)) |
|||
L = D - graph |
|||
return L |
|||
|
|||
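# Minimal ChebConv call following the docstrings above (a sketch; the graph
# must be a dense [N, N] adjacency without self loops so get_laplacian can
# normalize it). Runs on CPU.
def _chebconv_smoke_test():
    conv = ChebConv(in_c=16, out_c=32, K=2)
    graph = torch.ones(17, 17) - torch.eye(17)  # fully connected, no self loops
    feats = torch.rand(4, 17, 16)               # [B, N, C]
    print(conv(feats, graph).shape)             # torch.Size([4, 17, 32])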
|
|||
|
|||
|
|||
|
|||
|
|||
class cross_att(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, isTrainning=False, head=4): |
|||
super().__init__() |
|||
|
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
# self.lpm_st_1 = LearnedPosMap2D(d_time, d_joint, gamma=4) |
|||
self.scale = d_coor ** -0.5 |
|||
self.proj = nn.Linear(d_coor, d_coor) |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.head = head |
|||
|
|||
# self.gate_s = nn.Conv2d(d_coor//2, d_coor//2, kernel_size=3, stride=1, padding=1,groups=d_coor//2) |
|||
# self.gate_t = nn.Conv2d(d_coor//2, d_coor//2, kernel_size=3, stride=1, padding=1,groups=d_coor//2) |
|||
|
|||
# self.gate_s = MSLSP(d_time, d_joint, d_coor // 2) |
|||
self.gate_t = nn.Conv2d(d_coor//2, d_coor//2, kernel_size=3, stride=1, padding=1,groups=d_coor//2) |
|||
self.gate_s = nn.Conv2d(d_coor//2, d_coor//2, kernel_size=3, stride=1, padding=1,groups=d_coor//2) |
|||
|
|||
# self.gate_gs = ChebConv(d_coor//2, d_coor//2, K=2) |
|||
#self.scf = nn.Parameter(0.0001*torch.Tensor(1,1,d_coor//8)) |
|||
|
|||
#self.weight = nn.Parameter(torch.Tensor(K + 1, 1, in_c, out_c)) # [K+1, 1, in_c, out_c] |
|||
#init.xavier_normal_(self.scf) |
|||
|
|||
|
|||
self.body_edges = torch.tensor([[0, 1], [1, 2], [2, 3], |
|||
[0, 4], [4, 5], [5, 6], |
|||
[0, 7], [7, 8], [8, 9], [9, 10], |
|||
[8, 11], [11, 12], [12, 13], |
|||
[8, 14], [14, 15], [15, 16]], dtype=torch.long) |
|||
# [0,17],[1,17],[2,17],[3,17],[4,17],[5,17],[6,17],[7,17],[8,17],[9,17], |
|||
#[10,17],[11,17],[12,17],[13,17],[14,17],[15,17],[16,17] |
|||
# self.conv_2 = nn.Conv2d(d_coor, d_coor, kernel_size=5, stride=1, padding=2,groups=d_coor) |
|||
self.graph = adj_mx_from_edges(d_joint, self.body_edges).long().cuda() |
|||
self.emb = nn.Embedding(20, d_coor//8, padding_idx=0) |
|||
self.part = torch.tensor([0,0,1,1,1,2,2,2,3,3,3,4,4,4,0,0,0]).long().cuda() |
|||
|
|||
# self.gate_t = MSLSP(d_time, d_joint, d_coor//2) |
|||
|
|||
|
|||
# self.lpm_s = LearnedPosMap2D(d_time,d_joint) |
|||
# self.lpm_t = LearnedPosMap2D(d_time,d_joint) |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
# print(self.scf) |
|||
# exit() |
|||
# input = input + self.lpm_st_1(input) |
|||
h = input |
|||
# print(input.shape) |
|||
# exit() |
|||
x = self.layer_norm(input) |
|||
qkv = self.qkv(x) |
|||
|
|||
qkv = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3) # b,t,s,c |
|||
# print(qkv.shape) |
|||
|
|||
qkv_s, qkv_t = qkv.chunk(2, 4) |
|||
# print(qkv_s.shape,qkv_t.shape) |
|||
|
|||
q_s, k_s, v_s = qkv_s[0], qkv_s[1], qkv_s[2] # b,t,s,c |
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s,c |
|||
|
|||
# print(q_s.shape,q_t.shape) |
|||
|
|||
q_s = rearrange(q_s, 'b t s (h c) -> (b h t) s c', h=self.head) |
|||
k_s = rearrange(k_s, 'b t s (h c) -> (b h t) c s ', h=self.head) |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t ', h=self.head) |
|||
|
|||
att_s = (q_s @ k_s) * self.scale # b*h,s,s |
|||
att_t = (q_t @ k_t) * self.scale # b*h,s,s |
|||
|
|||
att_s = att_s.softmax(-1) |
|||
att_t = att_t.softmax(-1) |
|||
|
|||
v_s = rearrange(v_s, 'b t s c -> b c t s ') |
|||
v_t = rearrange(v_t, 'b t s c -> b c t s ') |
|||
|
|||
|
|||
# |
|||
# print(v_s.shape,self.graph.shape) |
|||
lep_s = self.gate_s(v_s) |
|||
lep_t = self.gate_t(v_t) |
|||
v_s = rearrange(v_s, 'b c t s -> (b t ) s c') |
|||
# sep_s = self.gate_gs(v_s,self.graph) |
|||
sep_s = self.emb(self.part).unsqueeze(0) |
|||
# print(sep_s.shape) |
|||
|
|||
# sep_s = rearrange(sep_s, '(b t) s (h c) -> (b h t) s c ', t=t,h=self.head) |
|||
|
|||
lep_s = rearrange(lep_s, 'b (h c) t s -> (b h t) s c ', h=self.head) |
|||
lep_t = rearrange(lep_t, 'b (h c) t s -> (b h s) t c ', h=self.head) |
|||
|
|||
|
|||
v_s = rearrange(v_s, '(b t) s (h c) -> (b h t) s c ', t=t,h=self.head) |
|||
# v_s = rearrange(v_s, 'b (h c) t s -> (b h t) s c ', h=self.head) |
|||
v_t = rearrange(v_t, 'b (h c) t s -> (b h s) t c ', h=self.head) |
|||
#print(lep_s[55,:,:]) |
|||
#print(sep_s[55,:,:]) |
|||
#print(self.scf) |
|||
#print(self.scf*sep_s[55,:,:]) |
|||
#exit() |
|||
|
|||
# v = torch.cat((v1, v2), -1) |
|||
|
|||
x_s = att_s @ v_s + lep_s + 0.0001*self.drop(sep_s) # b*h,s,c//h |
|||
x_t = att_t @ v_t + lep_t # b*h,t,c//h |
|||
# print(x_s.shape,x_t.shape) |
|||
|
|||
x_s = rearrange(x_s, '(b h t) s c -> b h t s c ', h=self.head, t=t) |
|||
x_t = rearrange(x_t, '(b h s) t c -> b h t s c ', h=self.head, s=s) |
|||
# print(x_s.shape,x_t.shape) |
|||
x = torch.cat((x_s, x_t), -1) |
|||
x = rearrange(x, 'b h t s c -> b t s (h c) ') |
|||
|
|||
x = self.proj(x) |
|||
# print(x.shape,h.shape) |
|||
x = x + h |
|||
return x |
|||
|
|||
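# Shape sketch of the criss-cross split inside cross_att: the qkv channels are
# halved into a spatial group (attention over joints) and a temporal group
# (attention over frames), then concatenated back after attention. Sizes are
# illustrative assumptions.
def _criss_cross_split_sketch():
    b, t, s, c = 2, 27, 17, 256
    qkv = torch.rand(3, b, t, s, c)
    qkv_s, qkv_t = qkv.chunk(2, 4)  # each 3,b,t,s,c//2
    print(qkv_s.shape, qkv_t.shape)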
|
|||
class MLP_3D(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, isTrainning=False, ): |
|||
super().__init__() |
|||
|
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
|
|||
self.layer_norm1 = nn.LayerNorm(self.d_coor) |
|||
self.layer_norm2 = nn.LayerNorm(self.d_coor) |
|||
|
|||
self.mlp1 = Mlp(self.d_coor, self.d_coor * 4, self.d_coor) |
|||
|
|||
self.cross_att = cross_att(d_time, d_joint, d_coor, isTrainning) |
|||
self.drop = DropPath(0.0) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
x = self.cross_att(input) |
|||
|
|||
x = x + self.drop(self.mlp1(self.layer_norm1(x))) |
|||
|
|||
return x |
|||
|
|||
|
|||
class Mlp(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.1): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.drop = nn.Dropout(drop) |
|||
|
|||
def forward(self, x): |
|||
x = self.fc1(x) |
|||
x = self.act(x) |
|||
x = self.drop(x) |
|||
x = self.fc2(x) |
|||
x = self.drop(x) |
|||
return x |
|||
|
|||
|
|||
class Mlp_C(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.drop = nn.Dropout(drop) |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.sig = nn.Sigmoid() |
|||
self.avg = nn.AdaptiveAvgPool2d((1, 1)) |
|||
|
|||
def forward(self, x): |
|||
b, t, s, c = x.shape |
|||
# gate = self.avg(x.permute(0,3,1,2)).permute(0,2,3,1) |
|||
gate = self.fc1(x) |
|||
gate = self.act(gate) |
|||
gate = self.drop(gate) |
|||
gate = self.fc2(gate) |
|||
gate = self.sig(gate) |
|||
# gate = gate.expand(b,t,s,c) |
|||
x = x * gate |
|||
return x |
|||
|
|||
|
|||
class MlpMixer(nn.Module): |
|||
def __init__(self, num_block, d_time, d_joint, d_coor, isTrainning=False, ): |
|||
super(MlpMixer, self).__init__() |
|||
|
|||
self.num_block = num_block |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
|
|||
self.mixerblocks = [] |
|||
for l in range(self.num_block): |
|||
self.mixerblocks.append(MLP_3D(self.d_time, self.d_joint, self.d_coor, isTrainning)) |
|||
self.mixerblocks = nn.ModuleList(self.mixerblocks) |
|||
|
|||
def forward(self, input): |
|||
# blocks layers |
|||
for i in range(self.num_block): |
|||
input = self.mixerblocks[i](input) |
|||
# exit() |
|||
|
|||
return input |
|||
|
|||
|
|||
if __name__ == "__main__": |
|||
inputs = torch.rand(64, 351, 34) # [btz, channel, T, H, W] |
|||
# inputs = torch.rand(1, 64, 4, 112, 112) #[btz, channel, T, H, W] |
|||
net = Model() |
|||
output = net(inputs) |
|||
print(output.size()) |
|||
from thop import profile |
|||
|
|||
flops, params = profile(net, inputs=(inputs,)) |
|||
print(flops) |
|||
print(params) |
|||
""" |
|||
for name, param in net.named_parameters(): |
|||
if param.requires_grad: |
|||
print(name,':',param.size()) |
|||
""" |
@ -0,0 +1,221 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
# from model.module.trans import Transformer as Transformer_s |
|||
# from model.module.trans_hypothesis import Transformer |
|||
import numpy as np |
|||
from einops import rearrange |
|||
from collections import OrderedDict |
|||
from torch.nn import functional as F |
|||
from torch.nn import init |
|||
import scipy.sparse as sp |
|||
|
|||
from timm.models.layers import DropPath |
|||
|
|||
|
|||
|
|||
class Mlp(nn.Module): |
|||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.1): |
|||
super().__init__() |
|||
out_features = out_features or in_features |
|||
hidden_features = hidden_features or in_features |
|||
self.fc1 = nn.Linear(in_features, hidden_features, bias=False) |
|||
self.act = act_layer() |
|||
self.fc2 = nn.Linear(hidden_features, out_features, bias=False) |
|||
self.drop = nn.Dropout(drop) |
|||
|
|||
def forward(self, x): |
|||
x = self.fc1(x) |
|||
x = self.act(x) |
|||
x = self.drop(x) |
|||
x = self.fc2(x) |
|||
x = self.drop(x) |
|||
return x |
|||
|
|||
|
|||
class STC_ATTENTION(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor, head=8): |
|||
super().__init__() |
|||
""" |
|||
d_time: 帧数 |
|||
d_joint: 关节点数 |
|||
d_coor: 嵌入维度 |
|||
""" |
|||
# print(d_time, d_joint, d_coor, head) |
|||
self.qkv = nn.Linear(d_coor, d_coor * 3) |
|||
self.head = head |
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.scale = (d_coor // 2) ** -0.5 |
|||
self.proj = nn.Linear(d_coor, d_coor) |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
|
|||
# sep1 |
|||
# print(d_coor) |
|||
self.emb = nn.Embedding(5, d_coor//head//2) |
|||
self.part = torch.tensor([0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 3, 3, 3, 4, 4, 4]).long().cuda() |
|||
|
|||
# sep2 |
|||
self.sep2_t = nn.Conv2d(d_coor // 2, d_coor // 2, kernel_size=3, stride=1, padding=1, groups=d_coor // 2) |
|||
self.sep2_s = nn.Conv2d(d_coor // 2, d_coor // 2, kernel_size=3, stride=1, padding=1, groups=d_coor // 2) |
|||
|
|||
self.drop = DropPath(0.5) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
|
|||
h = input |
|||
x = self.layer_norm(input) |
|||
|
|||
qkv = self.qkv(x) # b, t, s, c-> b, t, s, 3*c |
|||
qkv = qkv.reshape(b, t, s, c, 3).permute(4, 0, 1, 2, 3) # 3,b,t,s,c |
|||
|
|||
# space group and time group |
|||
qkv_s, qkv_t = qkv.chunk(2, 4) # [3,b,t,s,c//2], [3,b,t,s,c//2] |
|||
|
|||
q_s, k_s, v_s = qkv_s[0], qkv_s[1], qkv_s[2] # b,t,s,c//2 |
|||
q_t, k_t, v_t = qkv_t[0], qkv_t[1], qkv_t[2] # b,t,s,c//2 |
|||
|
|||
# reshape for mat |
|||
q_s = rearrange(q_s, 'b t s (h c) -> (b h t) s c', h=self.head) # b,t,s,c//2-> b*h*t,s,c//2//h |
|||
k_s = rearrange(k_s, 'b t s (h c) -> (b h t) c s ', h=self.head) # b,t,s,c//2-> b*h*t,c//2//h,s |
|||
|
|||
q_t = rearrange(q_t, 'b t s (h c) -> (b h s) t c', h=self.head) # b,t,s,c//2 -> b*h*s,t,c//2//h |
|||
k_t = rearrange(k_t, 'b t s (h c) -> (b h s) c t ', h=self.head) # b,t,s,c//2-> b*h*s,c//2//h,t |
|||
|
|||
att_s = (q_s @ k_s) * self.scale # b*h*t,s,s |
|||
att_t = (q_t @ k_t) * self.scale # b*h*s,t,t |
|||
|
|||
att_s = att_s.softmax(-1) # b*h*t,s,s |
|||
att_t = att_t.softmax(-1) # b*h*s,t,t |
|||
|
|||
v_s = rearrange(v_s, 'b t s c -> b c t s ') |
|||
v_t = rearrange(v_t, 'b t s c -> b c t s ') |
|||
|
|||
# sep2 |
|||
sep2_s = self.sep2_s(v_s) # b,c//2,t,s |
|||
sep2_t = self.sep2_t(v_t) # b,c//2,t,s |
|||
sep2_s = rearrange(sep2_s, 'b (h c) t s -> (b h t) s c ', h=self.head) # b*h*t,s,c//2//h |
|||
sep2_t = rearrange(sep2_t, 'b (h c) t s -> (b h s) t c ', h=self.head) # b*h*s,t,c//2//h |
|||
|
|||
# sep1 |
|||
# v_s = rearrange(v_s, 'b c t s -> (b t ) s c') |
|||
# v_t = rearrange(v_t, 'b c t s -> (b s ) t c') |
|||
# print(lep_s.shape) |
|||
sep_s = self.emb(self.part).unsqueeze(0) # 1,s,c//2//h |
|||
sep_t = self.emb(self.part).unsqueeze(0).unsqueeze(0).unsqueeze(0) # 1,1,1,s,c//2//h |
|||
|
|||
# MSA |
|||
v_s = rearrange(v_s, 'b (h c) t s -> (b h t) s c ', h=self.head) # b*h*t,s,c//2//h |
|||
v_t = rearrange(v_t, 'b (h c) t s -> (b h s) t c ', h=self.head) # b*h*s,t,c//2//h |
|||
|
|||
x_s = att_s @ v_s + sep2_s + 0.0001 * self.drop(sep_s) # b*h*t,s,c//2//h |
|||
x_t = att_t @ v_t + sep2_t # b*h,t,c//h # b*h*s,t,c//2//h |
|||
|
|||
x_s = rearrange(x_s, '(b h t) s c -> b h t s c ', h=self.head, t=t) # b*h*t,s,c//h//2 -> b,h,t,s,c//h//2 |
|||
x_t = rearrange(x_t, '(b h s) t c -> b h t s c ', h=self.head, s=s) # b*h*s,t,c//h//2 -> b,h,t,s,c//h//2 |
|||
|
|||
x_t = x_t + 1e-9 * self.drop(sep_t) |
|||
|
|||
x = torch.cat((x_s, x_t), -1) # b,h,t,s,c//h |
|||
x = rearrange(x, 'b h t s c -> b t s (h c) ') # b,t,s,c |
|||
|
|||
# projection and skip-connection |
|||
x = self.proj(x) |
|||
x = x + h |
|||
return x |
|||
|
|||
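# Shape sketch for the sep1 position cue in STC_ATTENTION: a learned embedding
# shared by all joints of the same body part (the part tensor maps the 17
# joints to 5 parts), broadcast over every attention row. Sizes assume the
# demo configuration d_coor=256, head=8.
def _sep1_shape_sketch():
    part = torch.tensor([0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 3, 3, 3, 4, 4, 4])
    emb = nn.Embedding(5, 256 // 8 // 2)  # d_coor//head//2 = 16 channels
    print(emb(part).unsqueeze(0).shape)   # torch.Size([1, 17, 16])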
|
|||
class STC_BLOCK(nn.Module): |
|||
def __init__(self, d_time, d_joint, d_coor): |
|||
super().__init__() |
|||
|
|||
self.layer_norm = nn.LayerNorm(d_coor) |
|||
|
|||
self.mlp = Mlp(d_coor, d_coor * 4, d_coor) |
|||
|
|||
self.stc_att = STC_ATTENTION(d_time, d_joint, d_coor) |
|||
self.drop = DropPath(0.0) |
|||
|
|||
def forward(self, input): |
|||
b, t, s, c = input.shape |
|||
x = self.stc_att(input) |
|||
x = x + self.drop(self.mlp(self.layer_norm(x))) |
|||
|
|||
return x |
|||
|
|||
|
|||
class STCFormer(nn.Module): |
|||
def __init__(self, num_block, d_time, d_joint, d_coor): |
|||
super(STCFormer, self).__init__() |
|||
|
|||
self.num_block = num_block |
|||
self.d_time = d_time |
|||
self.d_joint = d_joint |
|||
self.d_coor = d_coor |
|||
|
|||
self.stc_block = [] |
|||
for l in range(self.num_block): |
|||
self.stc_block.append(STC_BLOCK(self.d_time, self.d_joint, self.d_coor)) |
|||
self.stc_block = nn.ModuleList(self.stc_block) |
|||
|
|||
def forward(self, input): |
|||
# blocks layers |
|||
for i in range(self.num_block): |
|||
input = self.stc_block[i](input) |
|||
# exit() |
|||
return input |
|||
|
|||
|
|||
class Model(nn.Module): |
|||
def __init__(self, args): |
|||
super().__init__() |
|||
|
|||
layers, d_hid, frames = args.layers, args.d_hid, args.frames |
|||
num_joints_in, num_joints_out = args.n_joints, args.out_joints |
|||
|
|||
# layers, length, d_hid = layers, frames, d_hid |
|||
# num_joints_in, num_joints_out = 17,17 |
|||
|
|||
self.pose_emb = nn.Linear(2, d_hid, bias=False) |
|||
self.gelu = nn.GELU() |
|||
self.stcformer = STCFormer(layers, frames, num_joints_in, d_hid) |
|||
self.regress_head = nn.Linear(d_hid, 3, bias=False) |
|||
|
|||
def forward(self, x): |
|||
# b, t, s, c = x.shape #batch,frame,joint,coordinate |
|||
# dimension transfer
|||
x = self.pose_emb(x) |
|||
x = self.gelu(x) |
|||
# spatio-temporal correlation |
|||
x = self.stcformer(x) |
|||
# regression head |
|||
x = self.regress_head(x) |
|||
|
|||
return x |
|||
|
|||
class Args: |
|||
def __init__(self, layers, d_hid, frames, n_joints, out_joints): |
|||
self.layers = layers |
|||
self.d_hid = d_hid |
|||
self.frames = frames |
|||
self.n_joints = n_joints |
|||
self.out_joints = out_joints |
|||
|
|||
if __name__ == "__main__": |
|||
# inputs = torch.rand(64, 351, 34) # [btz, channel, T, H, W] |
|||
# inputs = torch.rand(1, 64, 4, 112, 112) #[btz, channel, T, H, W] |
|||
args = Args(layers=6, d_hid=256, frames=27, n_joints=17, out_joints=17) |
|||
net = Model(args) |
|||
inputs = torch.rand([1, 27, 17, 2]) |
|||
if torch.cuda.is_available(): |
|||
net = net.cuda() |
|||
inputs = inputs.cuda() |
|||
output = net(inputs) |
|||
print(output.size()) |
|||
|
|||
from thop import profile |
|||
# flops = 2*macs; profile the model's MACs and parameter count
|||
macs, params = profile(net, inputs=(inputs,)) |
|||
print(2*macs) |
|||
print(params) |
@ -0,0 +1,163 @@ |
|||
import torch |
|||
import torch.nn as nn |
|||
from model.block.vanilla_transformer_encoder_pretrain import Transformer, Transformer_dec |
|||
from model.block.strided_transformer_encoder import Transformer as Transformer_reduce |
|||
import numpy as np |
|||
|
|||
class LayerNorm(nn.Module): |
|||
def __init__(self, features, eps=1e-6): |
|||
super(LayerNorm, self).__init__() |
|||
self.a_2 = nn.Parameter(torch.ones(features)) |
|||
self.b_2 = nn.Parameter(torch.zeros(features)) |
|||
self.eps = eps |
|||
|
|||
def forward(self, x): |
|||
mean = x.mean(-1, keepdim=True) |
|||
std = x.std(-1, keepdim=True) |
|||
return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 |
|||
|
|||
class Linear(nn.Module): |
|||
def __init__(self, linear_size, p_dropout=0.25): |
|||
super(Linear, self).__init__() |
|||
self.l_size = linear_size |
|||
|
|||
self.relu = nn.LeakyReLU(0.2, inplace=True) |
|||
self.dropout = nn.Dropout(p_dropout) |
|||
|
|||
#self.w1 = nn.Linear(self.l_size, self.l_size) |
|||
self.w1 = nn.Conv1d(self.l_size, self.l_size, kernel_size=1) |
|||
self.batch_norm1 = nn.BatchNorm1d(self.l_size) |
|||
|
|||
#self.w2 = nn.Linear(self.l_size, self.l_size) |
|||
self.w2 = nn.Conv1d(self.l_size, self.l_size, kernel_size=1) |
|||
self.batch_norm2 = nn.BatchNorm1d(self.l_size) |
|||
|
|||
def forward(self, x): |
|||
y = self.w1(x) |
|||
y = self.batch_norm1(y) |
|||
y = self.relu(y) |
|||
y = self.dropout(y) |
|||
|
|||
y = self.w2(y) |
|||
y = self.batch_norm2(y) |
|||
y = self.relu(y) |
|||
y = self.dropout(y) |
|||
|
|||
out = x + y |
|||
|
|||
return out |
|||
|
|||
class FCBlock(nn.Module): |
|||
|
|||
def __init__(self, channel_in, channel_out, linear_size, block_num): |
|||
super(FCBlock, self).__init__() |
|||
|
|||
self.linear_size = linear_size |
|||
self.block_num = block_num |
|||
self.layers = [] |
|||
self.channel_in = channel_in |
|||
self.stage_num = 3 |
|||
self.p_dropout = 0.1 |
|||
#self.fc_1 = nn.Linear(self.channel_in, self.linear_size) |
|||
self.fc_1 = nn.Conv1d(self.channel_in, self.linear_size, kernel_size=1) |
|||
self.bn_1 = nn.BatchNorm1d(self.linear_size) |
|||
for i in range(block_num): |
|||
self.layers.append(Linear(self.linear_size, self.p_dropout)) |
|||
#self.fc_2 = nn.Linear(self.linear_size, channel_out) |
|||
self.fc_2 = nn.Conv1d(self.linear_size, channel_out, kernel_size=1) |
|||
|
|||
self.layers = nn.ModuleList(self.layers) |
|||
self.relu = nn.LeakyReLU(0.2, inplace=True) |
|||
self.dropout = nn.Dropout(self.p_dropout) |
|||
|
|||
def forward(self, x): |
|||
|
|||
x = self.fc_1(x) |
|||
x = self.bn_1(x) |
|||
x = self.relu(x) |
|||
x = self.dropout(x) |
|||
for i in range(self.block_num): |
|||
x = self.layers[i](x) |
|||
x = self.fc_2(x) |
|||
|
|||
return x |

class Model_MAE(nn.Module):
    def __init__(self, args):
        super().__init__()

        layers, channel, d_hid, length = args.layers, args.channel, args.d_hid, args.frames
        stride_num = args.stride_num
        self.spatial_mask_num = args.spatial_mask_num
        self.num_joints_in, self.num_joints_out = args.n_joints, args.out_joints

        self.length = length
        dec_dim_shrink = 2  # the decoder runs at half the encoder width

        self.encoder = FCBlock(2 * self.num_joints_in, channel, 2 * channel, 1)

        self.Transformer = Transformer(layers, channel, d_hid, length=length)
        self.Transformer_dec = Transformer_dec(layers - 1, channel // dec_dim_shrink, d_hid // dec_dim_shrink, length=length)

        self.encoder_to_decoder = nn.Linear(channel, channel // dec_dim_shrink, bias=False)
        self.encoder_LN = LayerNorm(channel)

        # Decoder head: project decoder features back to 2D joint coordinates.
        self.fcn_dec = nn.Sequential(
            nn.BatchNorm1d(channel // dec_dim_shrink, momentum=0.1),
            nn.Conv1d(channel // dec_dim_shrink, 2 * self.num_joints_out, kernel_size=1)
        )

        # self.fcn_1 = nn.Sequential(
        #     nn.BatchNorm1d(channel, momentum=0.1),
        #     nn.Conv1d(channel, 3 * self.num_joints_out, kernel_size=1)
        # )

        self.dec_pos_embedding = nn.Parameter(torch.randn(1, length, channel // dec_dim_shrink))
        self.mask_token = nn.Parameter(torch.randn(1, 1, channel // dec_dim_shrink))

        self.spatial_mask_token = nn.Parameter(torch.randn(1, 1, 2))
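
    # forward() consumes 2D pose sequences of shape (B, 2, F, J, 1) together
    # with a boolean temporal mask over the F frames and a boolean spatial
    # mask over the joints of each frame; masked positions are replaced by
    # learned tokens, encoded, and the decoder reconstructs the 2D input.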
    def forward(self, x_in, mask, spatial_mask):
        x_in = x_in[:, :, :, :, 0].permute(0, 2, 3, 1).contiguous()
        b, f, _, _ = x_in.shape

        # Spatial masking: overwrite the selected joints in every frame with
        # the learned 2D mask token.
        x = x_in.clone()
        x[:, spatial_mask] = self.spatial_mask_token.expand(b, self.spatial_mask_num * f, 2)

        x = x.view(b, f, -1)
        x = x.permute(0, 2, 1).contiguous()
        x = self.encoder(x)
        x = x.permute(0, 2, 1).contiguous()

        # Temporal masking: the encoder receives the frame mask (mask_MAE)
        # and returns features for the visible frames only.
        feas = self.Transformer(x, mask_MAE=mask)

        feas = self.encoder_LN(feas)
        feas = self.encoder_to_decoder(feas)

        B, N, C = feas.shape

        # We don't unshuffle the visible tokens back into their original
        # order; instead we shuffle the positional embedding accordingly.
        expand_pos_embed = self.dec_pos_embedding.expand(B, -1, -1).clone()
        pos_emd_vis = expand_pos_embed[:, ~mask].reshape(B, -1, C)
        pos_emd_mask = expand_pos_embed[:, mask].reshape(B, -1, C)
        x_full = torch.cat([feas + pos_emd_vis, self.mask_token + pos_emd_mask], dim=1)

        x_out = self.Transformer_dec(x_full, pos_emd_mask.shape[1])

        x_out = x_out.permute(0, 2, 1).contiguous()
        x_out = self.fcn_dec(x_out)

        x_out = x_out.view(b, self.num_joints_out, 2, -1)
        x_out = x_out.permute(0, 2, 3, 1).contiguous().unsqueeze(dim=-1)

        return x_out
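
# A minimal usage sketch (hypothetical shapes; assumes `args` carries the
# fields read in __init__ -- layers, channel, d_hid, frames, stride_num,
# spatial_mask_num, n_joints, out_joints -- as parsed by common.opt):
#
#   model = Model_MAE(args).cuda()
#   x = torch.randn(8, 2, args.frames, args.n_joints, 1).cuda()        # 2D pose clips
#   mask = torch.zeros(args.frames, dtype=torch.bool).cuda()
#   mask[-args.frames // 2:] = True                                    # e.g. mask half the frames
#   spatial_mask = np.zeros((args.frames, args.n_joints), dtype=bool)
#   spatial_mask[:, :args.spatial_mask_num] = True                     # K joints per frame
#   out = model(x, mask, spatial_mask)                                 # (8, 2, frames, out_joints, 1)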


@ -0,0 +1,419 @@

import os
import glob
import torch
import random
import logging
import numpy as np
from tqdm import tqdm
import torch.nn as nn
import torch.utils.data
import torch.optim as optim
from common.opt import opts
from common.utils import *
from common.camera import get_uvd2xyz
from common.load_data_3dhp_mae import Fusion
from common.h36m_dataset import Human36mDataset
from model.block.refine import refine
from model.stc_pe_3dhp import Model
from model.stmo_pretrain import Model_MAE

import scipy.io as scio

opt = opts().parse()
os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpu

def train(opt, actions, train_loader, model, optimizer, epoch):
    return step('train', opt, actions, train_loader, model, optimizer, epoch)

def val(opt, actions, val_loader, model):
    with torch.no_grad():
        return step('test', opt, actions, val_loader, model)

def step(split, opt, actions, dataLoader, model, optimizer=None, epoch=None):
    model_trans = model['trans']
    model_refine = model['refine']
    model_MAE = model['MAE']

    if split == 'train':
        model_trans.train()
        model_refine.train()
        model_MAE.train()
    else:
        model_trans.eval()
        model_refine.eval()
        model_MAE.eval()

    loss_all = {'loss': AccumLoss()}
    error_sum = AccumLoss()
    error_sum_test = AccumLoss()

    action_error_sum = define_error_list(actions)
    action_error_sum_post_out = define_error_list(actions)
    action_error_sum_MAE = define_error_list(actions)

    # Left/right joint indices of the 17-joint MPI-INF-3DHP skeleton, used
    # for flip augmentation.
    joints_left = [5, 6, 7, 11, 12, 13]
    joints_right = [2, 3, 4, 8, 9, 10]

    data_inference = {}

    for i, data in enumerate(tqdm(dataLoader)):

        if opt.MAE:
            # 2D-to-2D masked reconstruction (pre-training) branch.
            if split == "train":
                batch_cam, input_2D, seq, subject, scale, bb_box, cam_ind = data
            else:
                batch_cam, input_2D, seq, scale, bb_box = data
            # get_varialbe [sic]: helper name as defined in common.utils.
            [input_2D, batch_cam, scale, bb_box] = get_varialbe(split, [input_2D, batch_cam, scale, bb_box])

            N = input_2D.size(0)
            f = opt.frames

            mask_num = int(f * opt.temporal_mask_rate)
            mask = np.hstack([
                np.zeros(f - mask_num),
                np.ones(mask_num),
            ]).flatten()

            np.random.seed()
            np.random.shuffle(mask)

            mask = torch.from_numpy(mask).to(torch.bool).cuda()
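
            # The temporal mask hides int(f * opt.temporal_mask_rate) frames at
            # random positions; the spatial mask below hides opt.spatial_mask_num
            # random joints in every frame.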
            spatial_mask = np.zeros((f, 17), dtype=bool)
            for k in range(f):
                ran = random.sample(range(0, 16), opt.spatial_mask_num)
                spatial_mask[k, ran] = True

            if opt.test_augmentation and split == 'test':
                input_2D, output_2D = input_augmentation_MAE(input_2D, model_MAE, joints_left, joints_right, mask, spatial_mask)
            else:
                input_2D = input_2D.view(N, -1, opt.n_joints, opt.in_channels, 1).permute(0, 3, 1, 2, 4).type(torch.cuda.FloatTensor)
                output_2D = model_MAE(input_2D, mask, spatial_mask)

            input_2D = input_2D.permute(0, 2, 3, 1, 4).view(N, -1, opt.n_joints, 2)
            output_2D = output_2D.permute(0, 2, 3, 1, 4).view(N, -1, opt.n_joints, 2)

            # The decoder outputs visible frames first, then masked frames, so
            # reorder the target the same way before computing the loss.
            loss = mpjpe_cal(output_2D, torch.cat((input_2D[:, ~mask], input_2D[:, mask]), dim=1))

        else:
            # 3D pose estimation (fine-tuning) branch.
            if split == "train":
                batch_cam, gt_3D, input_2D, seq, subject, scale, bb_box, cam_ind = data
            else:
                batch_cam, gt_3D, input_2D, seq, scale, bb_box = data

            [input_2D, gt_3D, batch_cam, scale, bb_box] = get_varialbe(split, [input_2D, gt_3D, batch_cam, scale, bb_box])

            N = input_2D.size(0)

            out_target = gt_3D.clone().view(N, -1, opt.out_joints, opt.out_channels)
            out_target[:, :, 14] = 0  # zero the root joint (index 14 for this skeleton)
            gt_3D = gt_3D.view(N, -1, opt.out_joints, opt.out_channels).type(torch.cuda.FloatTensor)

            if out_target.size(1) > 1:
                out_target_single = out_target[:, opt.pad].unsqueeze(1)
                gt_3D_single = gt_3D[:, opt.pad].unsqueeze(1)
            else:
                out_target_single = out_target
                gt_3D_single = gt_3D

            if opt.test_augmentation and split == 'test':
                input_2D, output_3D = input_augmentation(input_2D, model_trans, joints_left, joints_right)
            else:
                input_2D = input_2D.view(N, -1, opt.n_joints, opt.in_channels, 1).permute(0, 3, 1, 2, 4).type(torch.cuda.FloatTensor)
                output_3D = model_trans(input_2D)

            # Rescale predictions back to metric scale.
            output_3D = output_3D * scale.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).repeat(1, output_3D.size(1), opt.out_joints, opt.out_channels)
            output_3D_single = output_3D[:, opt.pad].unsqueeze(1)

            if split == 'train':
                pred_out = output_3D
            elif split == 'test':
                pred_out = output_3D_single

            input_2D = input_2D.permute(0, 2, 3, 1, 4).view(N, -1, opt.n_joints, 2)

            if opt.refine:
                # Lift (u, v, depth) to camera-space XYZ and refine.
                pred_uv = input_2D
                uvd = torch.cat((pred_uv[:, opt.pad, :, :].unsqueeze(1), output_3D_single[:, :, :, 2].unsqueeze(-1)), -1)
                xyz = get_uvd2xyz(uvd, gt_3D_single, batch_cam)
                xyz[:, :, 0, :] = 0
                post_out = model_refine(output_3D_single, xyz)
                loss = mpjpe_cal(post_out, out_target_single)
            else:
                loss = mpjpe_cal(pred_out, out_target)

        loss_all['loss'].update(loss.detach().cpu().numpy() * N, N)

        if split == 'train':
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if not opt.MAE:
                if opt.refine:
                    post_out[:, :, 14, :] = 0
                    joint_error = mpjpe_cal(post_out, out_target_single).item()
                else:
                    pred_out[:, :, 14, :] = 0
                    joint_error = mpjpe_cal(pred_out, out_target).item()

                error_sum.update(joint_error * N, N)

        elif split == 'test':
            if opt.MAE:
                # For the MAE branch, the test error is the 2D reconstruction MPJPE.
                joint_error_test = mpjpe_cal(torch.cat((input_2D[:, ~mask], input_2D[:, mask]), dim=1), output_2D).item()
            else:
                pred_out[:, :, 14, :] = 0
                joint_error_test = mpjpe_cal(pred_out, out_target).item()
                out = pred_out

                if opt.train == 0:
                    # Collect per-sequence predictions for export to .mat.
                    for seq_cnt in range(len(seq)):
                        seq_name = seq[seq_cnt]
                        if seq_name in data_inference:
                            data_inference[seq_name] = np.concatenate(
                                (data_inference[seq_name], out[seq_cnt].permute(2, 1, 0).cpu().numpy()), axis=2)
                        else:
                            data_inference[seq_name] = out[seq_cnt].permute(2, 1, 0).cpu().numpy()

            error_sum_test.update(joint_error_test * N, N)

    if split == 'train':
        if opt.MAE:
            return loss_all['loss'].avg * 1000
        else:
            return loss_all['loss'].avg, error_sum.avg
    elif split == 'test':
        if opt.MAE:
            return error_sum_test.avg * 1000
        if opt.refine:
            p1, p2 = print_error(opt.dataset, action_error_sum_post_out, opt.train)
        else:
            if opt.train == 0:
                for seq_name in data_inference.keys():
                    data_inference[seq_name] = data_inference[seq_name][:, :, None, :]
                mat_path = os.path.join(opt.checkpoint, 'inference_data_81_3dhp.mat')
                scio.savemat(mat_path, data_inference)

        return error_sum_test.avg
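
# Test-time flip augmentation: run the model on the original and the
# horizontally flipped copy of the input, un-flip the second prediction
# (negate x and swap left/right joints), and average the two outputs.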
def input_augmentation_MAE(input_2D, model_trans, joints_left, joints_right, mask, spatial_mask=None):
    N, _, T, J, C = input_2D.shape

    input_2D_flip = input_2D[:, 1].view(N, T, J, C, 1).permute(0, 3, 1, 2, 4)
    input_2D_non_flip = input_2D[:, 0].view(N, T, J, C, 1).permute(0, 3, 1, 2, 4)

    output_2D_flip = model_trans(input_2D_flip, mask, spatial_mask)

    # Undo the horizontal flip: negate the x channel (dim 1 of the (B, 2, F,
    # J, 1) output holds the 2D coordinates) and swap left/right joints.
    output_2D_flip[:, 0] *= -1
    output_2D_flip[:, :, :, joints_left + joints_right] = output_2D_flip[:, :, :, joints_right + joints_left]

    output_2D_non_flip = model_trans(input_2D_non_flip, mask, spatial_mask)

    output_2D = (output_2D_non_flip + output_2D_flip) / 2

    input_2D = input_2D_non_flip

    return input_2D, output_2D

def input_augmentation(input_2D, model_trans, joints_left, joints_right):
    N, _, T, J, C = input_2D.shape

    input_2D_flip = input_2D[:, 1].view(N, T, J, C, 1).permute(0, 3, 1, 2, 4)
    input_2D_non_flip = input_2D[:, 0].view(N, T, J, C, 1).permute(0, 3, 1, 2, 4)

    output_3D_flip = model_trans(input_2D_flip)

    # Undo the horizontal flip: negate x and swap left/right joints.
    output_3D_flip[:, :, :, 0] *= -1
    output_3D_flip[:, :, joints_left + joints_right] = output_3D_flip[:, :, joints_right + joints_left]

    output_3D_non_flip = model_trans(input_2D_non_flip)

    output_3D = (output_3D_non_flip + output_3D_flip) / 2

    input_2D = input_2D_non_flip

    return input_2D, output_3D

if __name__ == '__main__':
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    opt.manualSeed = 1

    # Fix all RNG seeds and force deterministic cuDNN for reproducibility.
    random.seed(opt.manualSeed)
    torch.manual_seed(opt.manualSeed)
    np.random.seed(opt.manualSeed)
    torch.cuda.manual_seed_all(opt.manualSeed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    if opt.train == 1:
        logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%Y/%m/%d %H:%M:%S',
                            filename=os.path.join(opt.checkpoint, 'train.log'), level=logging.INFO)

    root_path = opt.root_path
    dataset_path = root_path + 'data_3d_' + opt.dataset + '.npz'

    actions = define_actions(opt.actions)

    if opt.train:
        train_data = Fusion(opt=opt, train=True, root_path=root_path, MAE=opt.MAE)
        train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=opt.batchSize,
                                                       shuffle=True, num_workers=int(opt.workers), pin_memory=True)
    if opt.test:
        test_data = Fusion(opt=opt, train=False, root_path=root_path, MAE=opt.MAE)
        test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=opt.batchSize,
                                                      shuffle=False, num_workers=int(opt.workers), pin_memory=True)

    opt.out_joints = 17

    model = {}
    model['trans'] = nn.DataParallel(Model(opt)).cuda()
    model['refine'] = nn.DataParallel(refine(opt)).cuda()
    model['MAE'] = nn.DataParallel(Model_MAE(opt)).cuda()

    model_params = 0
    for parameter in model['trans'].parameters():
        model_params += parameter.numel()
    print('INFO: Trainable parameter count:', model_params)

    # if opt.MAE_test_reload == 1:
    #     model_dict = model['MAE'].state_dict()
    #     MAE_test_path = opt.previous_dir
    #     pre_dict_MAE = torch.load(MAE_test_path)
    #     for name, key in model_dict.items():
    #         model_dict[name] = pre_dict_MAE[name]
    #     model['MAE'].load_state_dict(model_dict)

    if opt.MAE_reload == 1:
        # Warm-start the pose model from MAE pre-trained weights; only keys
        # present in both state dicts are copied.
        model_dict = model['trans'].state_dict()

        MAE_path = opt.previous_dir
        pre_dict = torch.load(MAE_path)

        state_dict = {k: v for k, v in pre_dict.items() if k in model_dict.keys()}
        model_dict.update(state_dict)
        model['trans'].load_state_dict(model_dict)

    model_dict = model['trans'].state_dict()
    if opt.reload == 1:
        no_refine_path = opt.previous_dir
        pre_dict = torch.load(no_refine_path)
        for name, key in model_dict.items():
            model_dict[name] = pre_dict[name]
        model['trans'].load_state_dict(model_dict)

    refine_dict = model['refine'].state_dict()
    if opt.refine_reload == 1:
        refine_path = opt.previous_refine_name
        pre_dict_refine = torch.load(refine_path)
        for name, key in refine_dict.items():
            refine_dict[name] = pre_dict_refine[name]
        model['refine'].load_state_dict(refine_dict)

    all_param = []
    lr = opt.lr
    for i_model in model:
        all_param += list(model[i_model].parameters())
    optimizer_all = optim.Adam(all_param, lr=opt.lr, amsgrad=True)
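
    # One Adam optimizer updates all sub-models (trans, refine, MAE) jointly;
    # the local `lr` variable mirrors the optimizer's learning rate for logging.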

    for epoch in range(1, opt.nepoch):
        if opt.train == 1:
            if not opt.MAE:
                loss, mpjpe = train(opt, actions, train_dataloader, model, optimizer_all, epoch)
            else:
                loss = train(opt, actions, train_dataloader, model, optimizer_all, epoch)
        if opt.test == 1:
            p1 = val(opt, actions, test_dataloader, model)
            data_threshold = p1

            # Save a checkpoint whenever the validation metric improves.
            if opt.train and data_threshold < opt.previous_best_threshold:
                if opt.MAE:
                    opt.previous_name = save_model(opt.previous_name, opt.checkpoint, epoch, data_threshold,
                                                   model['MAE'], 'MAE')
                else:
                    opt.previous_name = save_model(opt.previous_name, opt.checkpoint, epoch, data_threshold, model['trans'], 'no_refine')

                    if opt.refine:
                        opt.previous_refine_name = save_model(opt.previous_refine_name, opt.checkpoint, epoch,
                                                              data_threshold, model['refine'], 'refine')
                opt.previous_best_threshold = data_threshold

            if opt.train == 0:
                print('p1: %.2f' % (p1))
                break
            else:
                if opt.MAE:
                    logging.info('epoch: %d, lr: %.7f, loss: %.4f, p1: %.2f' % (epoch, lr, loss, p1))
                    print('e: %d, lr: %.7f, loss: %.4f, p1: %.2f' % (epoch, lr, loss, p1))
                else:
                    logging.info('epoch: %d, lr: %.7f, loss: %.4f, MPJPE: %.2f, p1: %.2f' % (epoch, lr, loss, mpjpe, p1))
                    print('e: %d, lr: %.7f, loss: %.4f, M: %.2f, p1: %.2f' % (epoch, lr, loss, mpjpe, p1))

        # Learning-rate schedule: a large decay every large_decay_epoch
        # epochs, a small decay otherwise.
        if epoch % opt.large_decay_epoch == 0:
            for param_group in optimizer_all.param_groups:
                param_group['lr'] *= opt.lr_decay_large
                lr *= opt.lr_decay_large
        else:
            for param_group in optimizer_all.param_groups:
                param_group['lr'] *= opt.lr_decay
                lr *= opt.lr_decay


@ -0,0 +1,250 @@

import os
import glob
import torch
import random
import logging
import numpy as np
from tqdm import tqdm
import torch.nn as nn
import torch.utils.data
import torch.optim as optim
import scipy.io as scio
from common.opt import opts
from common.utils import *
from common.camera import get_uvd2xyz
from common.load_data_hm36_tds import Fusion
from common.h36m_dataset import Human36mDataset
from model.block.refine import refine
from model.hpformer_3 import Model
from torch.cuda.amp import autocast

import time

opt = opts().parse()
os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpu

def train(opt, actions, train_loader, model, optimizer, epoch):
    return step('train', opt, actions, train_loader, model, optimizer, epoch)

def val(opt, actions, val_loader, model):
    with torch.no_grad():
        return step('test', opt, actions, val_loader, model)

def step(split, opt, actions, dataLoader, model, optimizer=None, epoch=None):
    model_trans = model['trans']
    model_refine = model['refine']

    if split == 'train':
        model_trans.train()
        model_refine.train()
    else:
        model_trans.eval()
        model_refine.eval()

    loss_all = {'loss': AccumLoss()}
    action_error_sum = define_error_list(actions)
    action_error_sum_refine = define_error_list(actions)

    if split == 'train':
        print(f'amp: {opt.amp}')
        if opt.amp:
            # Mixed-precision training: the GradScaler scales the loss to
            # avoid fp16 gradient underflow.
            scaler = torch.cuda.amp.GradScaler()
    for i, data in enumerate(tqdm(dataLoader)):
        batch_cam, gt_3D, input_2D, action, subject, scale, bb_box, cam_ind = data
        # get_varialbe [sic]: helper name as defined in common.utils.
        [input_2D, gt_3D, batch_cam, scale, bb_box] = get_varialbe(split, [input_2D, gt_3D, batch_cam, scale, bb_box])

        if split == 'train':
            if opt.amp:
                with autocast():
                    output_3D = model_trans(input_2D)
            else:
                output_3D = model_trans(input_2D)
        else:
            input_2D, output_3D = input_augmentation(input_2D, model_trans)

        out_target = gt_3D.clone()
        out_target[:, :, 0] = 0  # zero the root joint (index 0 in Human3.6M)
        output_3D_single = output_3D[:, opt.pad].unsqueeze(1)

        if out_target.size(1) > 1:
            out_target_single = out_target[:, opt.pad].unsqueeze(1)
            gt_3D_single = gt_3D[:, opt.pad].unsqueeze(1)
        else:
            out_target_single = out_target
            gt_3D_single = gt_3D

        if opt.refine:
            # Lift (u, v, depth) to camera-space XYZ and refine the center frame.
            pred_uv = input_2D[:, opt.pad, :, :].unsqueeze(1)
            uvd = torch.cat((pred_uv, output_3D_single[:, :, :, 2].unsqueeze(-1)), -1)
            xyz = get_uvd2xyz(uvd, gt_3D_single, batch_cam)
            xyz[:, :, 0, :] = 0
            post_out = model_refine(output_3D_single, xyz)

        if split == 'train':
            if opt.amp:
                with autocast():
                    if opt.refine:
                        loss = mpjpe_cal(post_out, out_target_single)
                    else:
                        loss = mpjpe_cal(output_3D, out_target)
            else:
                if opt.refine:
                    loss = mpjpe_cal(post_out, out_target_single)
                else:
                    loss = mpjpe_cal(output_3D, out_target)

            N = input_2D.size(0)
            loss_all['loss'].update(loss.detach().cpu().numpy() * N, N)

            optimizer.zero_grad()
            if opt.amp:
                # Scale the loss, step the optimizer through the scaler,
                # then update the scale factor.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

        elif split == 'test':
            output_3D[:, :, 0, :] = 0
            action_error_sum = test_calculation(output_3D_single, out_target, action, action_error_sum, opt.dataset, subject)

            if opt.refine:
                output_3D[:, :, 0, :] = 0
                action_error_sum_refine = test_calculation(output_3D_single, out_target, action, action_error_sum_refine, opt.dataset, subject)

    if split == 'train':
        return loss_all['loss'].avg
    elif split == 'test':
        if opt.refine:
            p1, p2 = print_error(opt.dataset, action_error_sum_refine, opt.train)
        else:
            p1, p2 = print_error(opt.dataset, action_error_sum, opt.train)
        return p1, p2
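
# p1 and p2 returned by print_error are presumably the standard Protocol 1
# (MPJPE) and Protocol 2 (P-MPJPE, error after rigid alignment) metrics in mm;
# print_error itself is defined in common.utils, which is not shown here.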

def input_augmentation(input_2D, model_trans):
    # Left/right joint indices of the 17-joint Human3.6M skeleton.
    joints_left = [4, 5, 6, 11, 12, 13]
    joints_right = [1, 2, 3, 14, 15, 16]

    input_2D_non_flip = input_2D[:, 0]
    input_2D_flip = input_2D[:, 1]

    output_3D_non_flip = model_trans(input_2D_non_flip)
    output_3D_flip = model_trans(input_2D_flip)

    # Undo the horizontal flip: negate x and swap left/right joints, then
    # average the two predictions.
    output_3D_flip[:, :, :, 0] *= -1
    output_3D_flip[:, :, joints_left + joints_right, :] = output_3D_flip[:, :, joints_right + joints_left, :]

    output_3D = (output_3D_non_flip + output_3D_flip) / 2
    input_2D = input_2D_non_flip

    return input_2D, output_3D

if __name__ == '__main__':
    os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpu
    opt.manualSeed = 42

    random.seed(opt.manualSeed)
    torch.manual_seed(opt.manualSeed)

    if opt.train:
        logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%Y/%m/%d %H:%M:%S',
                            filename=os.path.join(opt.checkpoint, 'train.log'), level=logging.INFO)

    root_path = opt.root_path
    dataset_path = root_path + 'data_3d_' + opt.dataset + '.npz'

    dataset = Human36mDataset(dataset_path, opt)
    actions = define_actions(opt.actions)

    if opt.train:
        train_data = Fusion(opt=opt, train=True, dataset=dataset, root_path=root_path, tds=opt.t_downsample)
        train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=opt.batchSize // opt.stride,
                                                       shuffle=True, num_workers=int(opt.workers), pin_memory=True)

    test_data = Fusion(opt=opt, train=False, dataset=dataset, root_path=root_path, tds=opt.t_downsample)
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=opt.batchSize // opt.stride,
                                                  shuffle=False, num_workers=int(opt.workers), pin_memory=True)

    opt.out_joints = dataset.skeleton().num_joints()

    model = {}
    model['trans'] = Model(opt).cuda()
    model['refine'] = refine(opt).cuda()

    model_dict = model['trans'].state_dict()
    if opt.reload:
        no_refine_path = opt.previous_dir
        pre_dict = torch.load(no_refine_path)
        for name, key in model_dict.items():
            model_dict[name] = pre_dict[name]
        model['trans'].load_state_dict(model_dict)

    refine_dict = model['refine'].state_dict()
    if opt.refine_reload:
        refine_path = opt.previous_refine_name
        pre_dict_refine = torch.load(refine_path)
        for name, key in refine_dict.items():
            refine_dict[name] = pre_dict_refine[name]
        model['refine'].load_state_dict(refine_dict)

    all_param = []
    lr = opt.lr
    for i_model in model:
        all_param += list(model[i_model].parameters())
    optimizer_all = optim.Adam(all_param, lr=opt.lr, amsgrad=True)

    for epoch in range(1, opt.nepoch):
        if opt.train:
            loss = train(opt, actions, train_dataloader, model, optimizer_all, epoch)

        p1, p2 = val(opt, actions, test_dataloader, model)

        # Save a checkpoint whenever the Protocol-1 error improves.
        if opt.train and p1 < opt.previous_best_threshold:
            opt.previous_name = save_model(opt.previous_name, opt.checkpoint, epoch, p1, model['trans'], 'no_refine')

            if opt.refine:
                opt.previous_refine_name = save_model(opt.previous_refine_name, opt.checkpoint, epoch,
                                                      p1, model['refine'], 'refine')
            opt.previous_best_threshold = p1

        if not opt.train:
            print('p1: %.2f, p2: %.2f' % (p1, p2))
            break
        else:
            logging.info('epoch: %d, lr: %.7f, loss: %.4f, p1: %.2f, p2: %.2f' % (epoch, lr, loss, p1, p2))
            print('e: %d, lr: %.7f, loss: %.4f, p1: %.2f, p2: %.2f' % (epoch, lr, loss, p1, p2))

        # Learning-rate schedule: a large decay every large_decay_epoch
        # epochs, a small decay otherwise.
        if epoch % opt.large_decay_epoch == 0:
            for param_group in optimizer_all.param_groups:
                param_group['lr'] *= opt.lr_decay_large
                lr *= opt.lr_decay_large
        else:
            for param_group in optimizer_all.param_groups:
                param_group['lr'] *= opt.lr_decay
                lr *= opt.lr_decay