问题
I know that BERT has a total vocabulary size of 30522, which contains some words and subwords. I want to get the initial input embeddings of BERT. So, my requirement is to get the table of size [30522, 768]
which I can index by token id to get that token's embedding. Where can I get this table?
回答1:
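BertModel provides get_input_embeddings(), which returns the input embedding layer; that layer's weight attribute is the [30522, 768] table, and calling the layer with a token id returns that token's row: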
import torch
from transformers import BertModel, BertTokenizer
# Load the pretrained tokenizer and model for 'bert-base-uncased'.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

# Fetch the input embedding layer once instead of on every iteration of the
# comprehension below. NOTE(review): its `.weight` attribute should be the
# full [vocab_size, hidden_size] lookup table indexable by token id --
# confirm against the torch.nn.Embedding documentation.
embedding_layer = bert.get_input_embeddings()

# Map every vocabulary token to its embedding vector. The loop variable is
# named `token_id` (not `id`) so the builtin `id` is not shadowed.
token_embedding = {
    token: embedding_layer(torch.tensor(token_id))
    for token, token_id in tokenizer.get_vocab().items()
}
print(len(token_embedding))
print(token_embedding['[CLS]'])
Output:
30522
tensor([ 1.3630e-02, -2.6490e-02, -2.3503e-02, -7.7876e-03, 8.5892e-03,
-7.6645e-03, -9.8808e-03, 6.0184e-03, 4.6921e-03, -3.0984e-02,
1.8883e-02, -6.0093e-03, -1.6652e-02, 1.1684e-02, -3.6245e-02,
8.3482e-03, -1.2112e-03, 1.0322e-02, 1.6692e-02, -3.0354e-02,
-1.2372e-02, -2.5173e-02, -8.9602e-03, 8.1994e-03, -2.0011e-02,
-1.5901e-02, -3.8394e-03, 1.4241e-03, 7.0500e-03, 1.6092e-03,
-2.7764e-03, 9.4931e-03, -2.2768e-02, 1.9317e-02, -1.3442e-02,
-2.3763e-02, -1.4617e-02, 9.7735e-03, -2.2428e-03, 3.0642e-02,
6.7829e-03, -2.6471e-03, -1.8553e-02, -1.2363e-02, 7.6489e-03,
-2.5461e-03, -3.1498e-01, 6.3761e-03, 4.8914e-02, -7.7636e-03,
6.0919e-02, 2.1346e-02, -3.9741e-02, 2.2853e-01, 2.6502e-02,
-1.0144e-03, -7.8480e-03, -1.9995e-03, 1.7057e-02, -3.3270e-02,
4.5421e-03, 6.1751e-03, -1.0077e-01, -2.0973e-02, -1.4512e-04,
-9.6657e-03, 1.0871e-02, -1.4786e-02, 2.6437e-04, 2.1166e-02,
1.6492e-02, -5.1928e-03, -1.1857e-02, -9.9159e-03, -1.4363e-02,
-1.2405e-02, -1.2973e-02, 2.6778e-02, -1.0986e-02, 1.0572e-02,
-2.5566e-02, 5.2494e-03, 1.5890e-02, -5.1504e-03, -7.5859e-03,
2.0259e-02, -7.0155e-03, 1.6359e-02, 1.7487e-02, 5.4297e-03,
-8.6403e-03, 2.8821e-02, -7.8964e-03, 1.9259e-02, 2.3868e-02,
-4.3472e-03, 5.5662e-02, -2.1940e-02, 4.1779e-03, -5.7216e-03,
2.6712e-02, -5.0371e-03, 2.4923e-02, -1.3429e-02, -8.4337e-03,
9.8188e-02, -1.2940e-03, 1.2865e-02, -1.5930e-03, 3.6437e-03,
1.5569e-02, 1.8620e-02, -9.0643e-03, -1.9740e-02, 1.0530e-02,
-2.7359e-03, -7.5283e-03, 1.1492e-03, 2.6162e-03, -6.2757e-03,
-8.6096e-03, 6.6221e-01, -3.2235e-03, -4.1309e-02, 3.3047e-03,
-2.5040e-03, 1.2838e-04, -6.8073e-03, 6.0291e-03, -9.8468e-03,
8.0641e-03, -1.9815e-03, 2.5801e-02, 5.7429e-03, -1.0712e-02,
2.9176e-02, 5.9414e-03, 2.4795e-02, -1.7887e-02, 7.3183e-01,
1.0964e-02, 5.9942e-03, -4.6157e-02, 4.0131e-02, -9.7481e-03,
-8.9496e-01, 1.6385e-02, -1.9816e-03, 1.4691e-02, -1.9837e-02,
-1.7611e-02, -4.5263e-04, -1.8605e-02, -1.5660e-02, -1.0709e-02,
1.8016e-02, -3.4149e-03, -1.2632e-02, 4.2877e-03, -3.9169e-01,
1.0016e-02, -1.0955e-02, 4.5133e-03, -5.1150e-03, 4.9968e-03,
1.7852e-02, 1.1313e-02, 2.6519e-03, 3.3658e-01, -1.8168e-02,
1.3170e-02, 7.3927e-03, 5.2521e-03, -9.6230e-03, 1.2844e-02,
4.1554e-01, -9.7247e-03, -4.2439e-03, 5.5287e-04, 1.8271e-02,
-1.3889e-03, -2.0502e-03, -8.1946e-03, -6.5979e-06, -7.2764e-04,
-1.4625e-03, -6.9872e-03, -6.9633e-03, -8.0701e-03, 1.9936e-02,
4.8370e-03, 8.6883e-03, -4.9246e-02, -2.0028e-02, 1.4124e-03,
1.0444e-02, -1.1236e-02, -4.4654e-03, -2.0491e-02, -2.7654e-02,
-3.7079e-02, 1.3215e-02, 6.9498e-02, -3.1109e-02, 7.0562e-03,
1.0887e-02, -7.8090e-03, -1.0501e-02, -4.8735e-03, -6.8399e-04,
1.4717e-02, 4.4342e-03, 1.6012e-02, -1.0427e-02, -2.5767e-02,
-2.2699e-01, 8.6569e-02, 2.3453e-02, 4.6362e-02, 3.5609e-03,
2.1353e-02, 2.3703e-02, -2.0252e-02, 2.1580e-02, 7.2652e-03,
2.0933e-01, 1.2108e-02, 1.0869e-02, 7.0568e-03, -3.1132e-02,
2.0505e-02, 3.2248e-03, -2.2724e-03, 5.5342e-03, 3.0563e-03,
1.9542e-02, 1.2827e-03, 1.5952e-02, -1.5458e-02, -3.8455e-03,
-4.9417e-03, -1.0446e-02, 7.0516e-03, 2.2467e-03, -9.3643e-03,
1.9163e-02, 1.4239e-02, -1.5816e-02, 8.7413e-03, 2.4737e-02,
-7.3777e-03, -4.0975e-02, 9.4948e-03, 1.4700e-02, 2.6819e-02,
1.0706e-02, 1.0621e-02, -7.1816e-03, -8.5402e-03, 1.2261e-02,
-4.8679e-03, -9.6136e-03, 7.8765e-04, 3.8504e-02, -7.7485e-03,
-6.5018e-03, 3.4352e-03, 2.2931e-04, 5.7456e-03, -4.8441e-03,
-9.0898e-03, 8.6298e-03, 5.4740e-03, 2.2274e-02, -2.1218e-02,
-2.6795e-02, -3.5337e-03, 1.0785e-02, 1.2475e-02, -6.1160e-03,
1.0729e-02, -9.7955e-03, 1.8543e-02, -6.0488e-03, -4.5744e-03,
2.7089e-03, 1.5632e-02, -1.2928e-02, -3.0778e-03, -1.0325e-02,
-7.9550e-03, -6.3065e-02, 2.1062e-02, -6.6717e-03, 8.4616e-03,
1.4475e-02, 1.1477e-01, -2.2838e-02, -3.7491e-02, -3.6218e-02,
-3.1994e-02, -8.9252e-03, 3.1720e-02, -1.1260e-02, -1.2980e-01,
-1.0315e-03, -4.7242e-03, -2.0092e-02, -9.4521e-01, -2.2178e-02,
-4.4297e-04, 1.9711e-02, 3.3402e-02, -1.0513e-02, 1.4492e-02,
-1.9697e-02, -9.8452e-03, -1.7347e-02, 2.3472e-02, 7.6570e-02,
1.9504e-02, 9.3617e-03, 8.2672e-03, -1.0471e-02, -1.9932e-03,
2.0000e-02, 2.0485e-02, 1.0977e-02, 1.7720e-02, 1.3532e-02,
7.3682e-03, 3.4906e-04, 1.8772e-03, 1.9976e-02, -3.2041e-02,
-8.9169e-03, 1.2900e-02, -1.3331e-02, 6.6207e-03, -5.7063e-03,
-1.1482e-02, 8.3907e-03, -6.4162e-03, 1.5816e-02, 7.8921e-03,
4.4177e-03, 2.2568e-02, 1.0239e-02, -3.0194e-04, 1.3294e-02,
-2.1606e-02, 3.8832e-03, 2.4475e-02, 4.3808e-02, -2.1031e-03,
-1.2163e-02, -4.0786e-02, 1.5565e-02, 1.4750e-02, 1.6645e-02,
2.8083e-02, 1.8920e-03, -1.4733e-04, -2.6208e-02, 2.3780e-02,
1.8657e-04, -2.2931e-03, 3.0334e-03, -1.7294e-02, -2.3001e-02,
8.6004e-03, -3.3497e-02, 2.5660e-02, -1.9225e-02, -2.7186e-02,
-2.1020e-02, -3.5213e-02, -1.8228e-03, -8.2840e-03, 1.1212e-02,
1.0387e-02, -3.4194e-01, -1.9705e-03, 1.1558e-02, 5.1976e-03,
7.4498e-03, 5.7142e-03, 2.8401e-02, -7.7551e-03, 1.0682e-02,
-1.2657e-02, -1.8065e-02, 2.6681e-03, 3.3947e-03, -4.5565e-02,
-2.1170e-02, -1.7830e-02, 3.4679e-03, -2.2051e-02, -5.4176e-03,
-1.1517e-02, -3.4155e-02, -3.0335e-03, -1.3915e-02, 6.2173e-03,
-1.1101e-02, -1.5308e-02, 9.2188e-03, -7.5665e-03, 6.5685e-03,
8.0935e-03, 3.1139e-03, -5.5047e-03, -3.1347e-02, 2.2140e-02,
1.0865e-02, -2.7849e-02, -4.9580e-03, 1.8804e-03, 1.0007e-01,
-1.8013e-03, -4.8792e-03, 1.5534e-02, -2.0179e-02, -1.2351e-02,
-1.3871e-02, 1.1439e-02, -9.0208e-03, 1.2580e-02, -2.5973e-02,
-2.0398e-02, -1.9464e-03, 4.3189e-03, 2.0707e-02, 5.0029e-03,
-1.0679e-02, 1.2298e-02, 1.0269e-02, 2.2228e-02, 2.9754e-02,
-2.6392e-03, 1.9286e-02, -1.5137e-02, 2.1914e-01, 1.3030e-02,
-7.4460e-03, -9.6818e-04, 2.9736e-02, 9.8722e-03, -5.6688e-03,
4.2518e-03, 1.8941e-02, -6.3909e-03, 8.0590e-03, -6.7893e-03,
6.0878e-03, -5.3970e-03, 7.5776e-04, 1.1374e-03, -5.0035e-03,
-1.6159e-03, 1.6764e-02, 9.1251e-03, 1.3020e-02, -1.0368e-02,
2.2141e-02, -2.5411e-03, -1.5227e-02, 2.3444e-02, 8.4076e-04,
-1.1465e-01, 2.7017e-03, -4.4961e-03, 2.9762e-04, -3.9612e-03,
8.9038e-05, 2.8683e-02, 5.0068e-03, 1.6509e-02, 7.8983e-04,
5.7728e-03, 3.2685e-02, -1.0457e-01, 1.2989e-02, 1.1278e-02,
1.1943e-02, 1.5258e-02, -6.2411e-04, 1.0682e-04, 1.2087e-02,
7.2984e-03, 2.7758e-02, 1.7572e-02, -6.0345e-03, 1.7211e-02,
1.4121e-02, 6.4663e-02, 9.1813e-03, 3.2555e-03, -3.2667e-02,
2.9132e-02, -1.7770e-02, 1.5302e-03, -2.9944e-02, -2.0706e-02,
-3.6528e-03, -1.5497e-02, 1.5223e-02, -1.4751e-02, -2.2381e-02,
6.9636e-03, -8.0838e-03, -2.4583e-03, -2.0677e-02, 8.8132e-03,
-6.9554e-04, 1.6965e-02, 1.8535e-01, 3.5843e-04, 1.0812e-02,
-4.2391e-03, 8.1779e-03, 3.4144e-02, -1.8996e-03, 2.9939e-03,
3.6898e-04, -1.0144e-02, -5.7416e-03, -5.7676e-03, 1.7565e-01,
-1.5793e-03, -2.6617e-02, -1.2572e-02, 3.0421e-04, -1.2132e-02,
-1.4168e-02, 1.2154e-02, 8.4700e-03, -1.6284e-02, 2.6983e-03,
-6.8554e-03, 2.7829e-01, 2.4060e-02, 1.1130e-02, 7.6095e-04,
3.1341e-01, 2.1668e-02, 1.0277e-02, -3.0065e-02, -8.3565e-03,
5.2488e-03, -1.1287e-02, -1.8266e-02, 1.1814e-02, 1.2662e-02,
2.9036e-04, 7.0254e-04, -1.4084e-02, 1.2925e-02, 3.9504e-03,
-7.9568e-03, 3.2794e-02, 7.3839e-03, 2.4609e-02, 9.6109e-03,
-8.7206e-03, 9.2571e-03, -3.5850e-03, -8.9996e-03, 2.3120e-03,
-1.8475e-02, -1.9610e-02, 1.1994e-02, 6.7156e-03, 1.9903e-02,
3.0703e-02, -4.9538e-03, -6.1673e-02, -6.4986e-03, -2.1317e-02,
-3.3650e-03, 2.3200e-03, -6.2224e-03, 3.7458e-03, 1.1542e-02,
-1.0181e-02, -8.4711e-03, 1.1603e-02, -5.6247e-03, -1.0220e-02,
-8.6501e-04, -1.2285e-02, -8.7487e-03, -1.1265e-02, 1.6322e-02,
1.5160e-02, 1.8882e-02, 5.1557e-03, -8.8616e-03, 4.2153e-03,
-1.9450e-02, -8.7365e-03, -9.7867e-03, 1.1667e-02, 5.0613e-03,
2.8221e-03, -7.1795e-03, 9.3306e-03, -4.9663e-02, 1.7708e-02,
-2.0959e-02, -3.3989e-02, 2.2581e-03, 5.1748e-03, -1.0133e-01,
2.1052e-03, 5.5644e-03, 1.3607e-03, 8.8388e-03, 1.0244e-02,
-3.8072e-03, 5.9209e-03, 6.7993e-03, 1.1594e-02, -1.1802e-02,
-2.4233e-03, -5.1504e-03, -1.1903e-02, 1.4075e-02, -4.0701e-03,
-2.9465e-02, -1.7579e-03, 4.3654e-03, 1.0429e-02, 3.7096e-02,
8.6493e-03, 1.5871e-02, 1.8034e-02, -3.2165e-03, -2.1941e-02,
2.6274e-02, -7.6941e-03, -5.9618e-03, -1.4179e-02, 8.0281e-03,
1.1293e-02, -6.6936e-05, 1.2899e-02, 1.0056e-02, -6.3919e-04,
2.0299e-02, 3.1528e-03, -4.8988e-03, 3.2754e-03, -1.1003e-01,
1.8414e-02, 2.2272e-03, -2.2185e-02, -4.8672e-03, 1.9643e-03,
3.0928e-02, -8.9599e-03, -1.1446e-02, -1.3794e-02, 7.1943e-03,
-5.8965e-03, 2.2605e-03, -2.6114e-02, -5.6616e-03, 6.5073e-03,
9.2219e-02, -6.7243e-03, 4.4427e-04, 7.2846e-03, -1.1021e-02,
7.8802e-04, -3.8878e-03, 1.0489e-02, 9.2883e-03, 1.8895e-02,
2.1808e-02, 6.2590e-04, -2.6519e-02, 7.0343e-04, -2.9067e-02,
-9.1515e-03, 1.0418e-03, 8.3222e-03, -8.7548e-03, -2.0637e-03,
-1.1450e-02, -8.8985e-04, -4.4062e-03, 2.3629e-02, -2.7221e-02,
3.2008e-02, 6.6325e-03, -1.1302e-02, -1.0138e-03, -1.6902e-01,
-8.4473e-03, 2.8536e-02, 1.4117e-03, -1.2136e-02, -1.4781e-02,
4.9960e-03, 3.3916e-02, 5.2710e-03, 1.7382e-02, -4.6315e-03,
1.1680e-02, -9.1395e-03, 1.8310e-02, 1.2321e-02, -2.4871e-02,
1.1535e-02, 5.0308e-03, 5.5028e-03, -7.2184e-03, -5.5210e-03,
1.7085e-02, 5.7236e-03, 1.7463e-03, 1.9969e-03, 6.1670e-03,
2.9347e-03, 1.3946e-02, -1.9984e-03, 1.0091e-02, 1.0388e-03,
-6.1902e-03, 3.0905e-02, 6.6038e-03, -9.1223e-02, -1.8411e-02,
5.4185e-03, 2.4396e-02, 1.5696e-02, -1.2742e-02, 1.8126e-02,
-2.6138e-02, 1.1170e-02, -1.3058e-02, -1.9386e-02, -5.9828e-03,
1.9176e-02, 1.9962e-03, -2.1538e-03, 3.3003e-02, 1.8407e-02,
-5.9498e-03, -3.2533e-03, -1.8917e-02, -1.5897e-02, -4.7057e-03,
5.4162e-03, -3.0037e-02, 8.6773e-03, -1.7942e-03, 6.6826e-03,
-1.1929e-02, -1.4076e-02, 1.6709e-02, 1.6860e-03, -3.3842e-03,
8.6805e-03, 7.1340e-03, 1.5147e-02], grad_fn=<EmbeddingBackward>)
来源:https://stackoverflow.com/questions/63126386/where-can-i-get-the-pretrained-word-embeddinngs-for-bert