# 二、使用siamese network进行文本相似度计算

## 2.2 数据处理

``````python
import numpy as np
import pandas as pd
import os
import math

def sentences_to_indices(X, word_to_index, max_len):
    """Convert an array of sentence strings into a matrix of token indices.

    :param X: array of strings, shape (m,); tokens separated by single spaces
    :param word_to_index: dict mapping token -> integer index (1-based; 0 is padding)
    :param max_len: maximum sequence length; longer sentences are truncated
    :return: float array of shape (m, max_len); unused trailing slots stay 0
    :raises KeyError: if a token is not present in word_to_index
    """
    num_sentences = X.shape[0]
    indices = np.zeros((num_sentences, max_len))
    for row, sentence in enumerate(X):
        for col, token in enumerate(sentence.split(" ")):
            if col >= max_len:
                # truncate sentences longer than max_len
                break
            indices[row, col] = word_to_index[token]
    return indices

"""
读取数据，对数据进行预处理，并生成embed_matrix
:param data_dir:数据目录
:param max_seq_len:
:param embed_dim:词向量维度
:param word_level:
:return:
"""
question_path = os.path.join(data_dir, "question.csv")
train_path = os.path.join(data_dir, "train.csv")
if word_level:
embed_path = os.path.join(data_dir, "word_embed.txt")  # 词向量
else:
embed_path = os.path.join(data_dir, "char_embed.txt")  # 字符向量

# 读取数据

# 把train里面的问题id匹配到句子
train = pd.merge(train, question, left_on=["q1"], right_on=["qid"], how="left")  # 匹配第一个问题
train = pd.merge(train, question, left_on=["q2"], right_on=["qid"], how="left")  # 匹配第二个问题

if word_level:
train = train[["label", "words_x", "words_y"]]
else:
train = train[["label", "chars_x", "chars_y"]]
train.columns = ["label", "q1", "q2"]

word = word_to_vec_map.index.values

# word2id,id2word
word_to_index = dict([(word[i], i+1) for i in range(len(word))])
index_to_word = dict([(i+1, word[i]) for i in range(len(word))])

train_q1_indices = sentences_to_indices(train.q1.values, word_to_index, max_seq_len)
train_q2_indices = sentences_to_indices(train.q2.values, word_to_index, max_seq_len)
label = train.label.values

vocab_len = len(word_to_index)+1
embed_matrix = np.zeros((vocab_len, embed_dim))
for word, index in word_to_index.items():
embed_matrix[index, :] = word_to_vec_map.loc[word].values

return train_q1_indices, train_q2_indices, label, embed_matrix, word_to_index, index_to_word

"""
读取测试数据
:param max_seq_len:
:param word_level:
:return:
"""
question_path = os.path.join(data_dir, "question.csv")
test_path = os.path.join(data_dir, "test.csv")
if word_level:
embed_path = os.path.join(data_dir, "word_embed.txt")
else:
embed_path = os.path.join(data_dir, "char_embed.txt")

# 读取数据

test = pd.merge(test, question, left_on=["q1"], right_on=["qid"], how="left")
test = pd.merge(test, question, left_on=["q2"], right_on=["qid"], how="left")

if word_level:
test = test[["words_x", "words_y"]]
else:
test = test[["chars_x", "chars_y"]]
test.columns = ["q1", "q2"]
word = word_to_vec_map.index.values

# word2id,id2word
word_to_index = dict([(word[i], i+1) for i in range(len(word))])
index_to_word = dict([(i+1, word[i]) for i in range(len(word))])

test_q1_indices = sentences_to_indices(test.q1.values, word_to_index, max_seq_len)
test_q2_indices = sentences_to_indices(test.q2.values, word_to_index, max_seq_len)
return test_q1_indices, test_q2_indices
``````

## 2.3 模型网络结构搭建

``````python
import numpy as np
import pandas as pd

np.random.seed(0)

from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, GaussianNoise, \
Input, Dropout, LSTM, Activation, BatchNormalization, concatenate, Subtract, Dot, Multiply, Bidirectional, Lambda
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras import optimizers
import tensorflow as tf
import tensorflow.keras.callbacks as kcallbacks

np.random.seed(1)
import warnings

warnings.filterwarnings("ignore")

# Hyperparameters for the siamese LSTM model.
MAX_SEQUENCE_LENGTH = 15  # 20 for character level and 15 for word level
EMBEDDING_DIM = 300  # dimensionality of the pre-trained word/char vectors
lstm_num = 64  # hidden units in each shared LSTM layer
lstm_drop = 0.5  # dropout rate applied after the first LSTM
BATCH_SIZE = 100  # fixed batch size; Input(batch_size=...) requires sample count divisible by it

def trainLSTM(train_q1, train_q2, train_label, embed_matrix):
    """Build and train the siamese LSTM similarity model.

    BUG FIX: the original called `model.fit` without ever calling
    `model.compile`, which makes Keras raise "You must compile your model
    before training/testing". Added the compile step and returned the
    trained model and history (the original returned None; callers ignore
    the return value, so this is backward-compatible).

    :param train_q1: index matrix for the first questions, (m, MAX_SEQUENCE_LENGTH)
    :param train_q2: index matrix for the second questions, same shape
    :param train_label: binary labels, shape (m,)
    :param embed_matrix: pre-trained embedding matrix, (vocab_len, EMBEDDING_DIM)
    :return: (model, hist) — the trained Model and its training History
    """
    question1 = Input(shape=(MAX_SEQUENCE_LENGTH,), batch_size=BATCH_SIZE)
    question2 = Input(shape=(MAX_SEQUENCE_LENGTH,), batch_size=BATCH_SIZE)

    # One embedding layer shared by both inputs, seeded with the
    # pre-trained vectors.
    embed_layer = Embedding(embed_matrix.shape[0], EMBEDDING_DIM, weights=[embed_matrix])
    q1_embed = embed_layer(question1)
    q2_embed = embed_layer(question2)

    # Siamese towers: both questions go through the SAME two LSTM layers.
    shared_lstm1 = LSTM(lstm_num, return_sequences=True)
    shared_lstm2 = LSTM(lstm_num)

    q1 = shared_lstm1(q1_embed)
    q1 = Dropout(lstm_drop)(q1)
    q1 = BatchNormalization()(q1)
    q1 = shared_lstm2(q1)

    q2 = shared_lstm1(q2_embed)
    q2 = Dropout(lstm_drop)(q2)
    q2 = BatchNormalization()(q2)
    q2 = shared_lstm2(q2)

    # Distance feature: element-wise squared difference, (batch_size, lstm_num).
    d = Subtract()([q1, q2])
    distance = Multiply()([d, d])
    # Angle feature: element-wise product, (batch_size, lstm_num).
    angle = Multiply()([q1, q2])
    merged = concatenate([distance, angle])
    merged = Dropout(0.3)(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(256, activation="relu")(merged)
    merged = Dropout(0.3)(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(64, activation="relu")(merged)
    merged = Dropout(0.3)(merged)
    merged = BatchNormalization()(merged)

    # Binary similarity probability.
    res = Dense(1, activation="sigmoid")(merged)
    model = Model(inputs=[question1, question2], outputs=res)
    # Missing in the original — fit() fails without a compiled model.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()

    hist = model.fit([train_q1, train_q2], train_label, epochs=30,
                     batch_size=BATCH_SIZE, validation_split=0.2, shuffle=True)
    return model, hist

``````

## 2.4 模型训练

``````train_q1_indices, train_q2_indices, train_label, embed_matrix, word_to_index, index_to_word = load_dataset("/content/drive/My Drive/data/text_similarity/data", MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, False)
# Sanity-check the shapes of everything that was loaded.
print('train_q1: ', train_q1_indices.shape)
print('train_q2: ', train_q2_indices.shape)
# one_hot is only used here to inspect the shape; training uses the raw labels
print('train_label: ', tf.one_hot(train_label,depth=2).shape)
print('embed_matrix: ', embed_matrix.shape)

# Load the test data (character level, matching the training call above).
test_q1, test_q2 = load_test_data("/content/drive/My Drive/data/text_similarity/data", MAX_SEQUENCE_LENGTH, word_level=False)
print('test_q1: ', test_q1.shape)
print('test_q2: ', test_q2.shape)
print("word_to_index len:",len(word_to_index))
``````
``````python
trainLSTM(train_q1_indices[:243000], train_q2_indices[:243000], train_label[:243000], embed_matrix) # On Colab this errors when the sample count is not divisible by BATCH_SIZE, hence the truncation
``````

[1] https://zhuanlan.zhihu.com/p/88938220

[2] https://www.jianshu.com/p/827dd447daf9

github代码:https://github.com/chongzicbo/nlp-ml-dl-notes/blob/master/code/text_similarity/NLP10%EF%BC%9Asiamese_text_similarity.ipynb