A writeup of the 新春赛 (New Year) competition I took part in recently; this post records the process. The task is binary spam-message classification, and three scripts cover training, offline evaluation, and prediction on new text.
train.py
import tensorflow_datasets as tfds
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import Sequential, layers
import numpy as np
import pandas as pd
tf.random.set_seed(22)
np.random.seed(33)
def load_data():
    train_data = pd.read_csv('preprocess_train.csv').to_numpy()
    # column 0 is the label, column 1 is the message text
    # (np.str was removed in NumPy 1.24; plain str is the correct dtype here)
    return train_data[:, 1].astype(str), train_data[:, 0].astype(np.int32)
# 1 is ham and 0 is spam
x, y = load_data()
print('Training data loaded.')
max_review_len = 80
embedding_len = 100
units = 64
# preprocess (create encoder)
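# note: newer tensorflow_datasets releases moved this class to tfds.deprecated.text.TokenTextEncoder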
encoder = tfds.features.text.TokenTextEncoder.load_from_file('lib')
vocab_size = encoder.vocab_size
print('Vocabulary encoder loaded.')
# Run once to build the vocabulary from the training texts and save the encoder:
# tokenizer = tfds.features.text.Tokenizer()
# vocabulary_set = set()
# for text in x:
#     some_tokens = tokenizer.tokenize(text)
#     vocabulary_set.update(some_tokens)
#
# encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
# vocab_size = encoder.vocab_size
# encoder.save_to_file('lib')
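# A quick sanity check of the loaded encoder (hypothetical sample sentence,
# assuming its words are in the vocabulary); uncomment to try:
# ids = encoder.encode('win a free prize now')
# print(ids, '->', encoder.decode(ids))  # ids should round-trip back to the tokens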
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    encoded_text = keras.preprocessing.sequence.pad_sequences([encoded_text], maxlen=max_review_len)
    return encoded_text[0], label
def encode_map_fn(text, label):
    # py_function doesn't set the shape of the returned tensors.
    encoded_text, label = tf.py_function(encode,
                                         inp=[text, label],
                                         Tout=(tf.int32, tf.int32))
    # `tf.data.Dataset` works best if all components have a shape set,
    # so set the shapes manually:
    encoded_text.set_shape([max_review_len])
    label.set_shape([])
    return encoded_text, label
db = (tf.data.Dataset.from_tensor_slices((x, y))
      .map(encode_map_fn)
      .shuffle(10000)
      .batch(32, drop_remainder=True))
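# Optional sanity check of the pipeline above; uncomment to inspect one batch:
# for text_batch, label_batch in db.take(1):
#     print(text_batch.shape, label_batch.shape)  # expected: (32, 80) (32,)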
model = Sequential([
    # the +10 presumably leaves headroom above vocab_size for padding/OOV ids
    layers.Embedding(vocab_size + 10, embedding_len, input_length=max_review_len),
    layers.LSTM(units, dropout=0.5, return_sequences=True, unroll=True),
    layers.LSTM(units, dropout=0.5, unroll=True),
    layers.Dense(1, activation=tf.sigmoid)
])
model.compile(optimizer=keras.optimizers.Adam(0.001),
              loss=tf.losses.BinaryCrossentropy(),
              metrics=['accuracy'])
model.summary()
print('Model built, starting training.')
model.fit(db, epochs=5)
model.save_weights('ckpt/weights.ckpt')
print('Training finished, weights saved.')
test.py
import tensorflow_datasets as tfds
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import Sequential, layers
import numpy as np
import pandas as pd
def load_data():
    test_data = pd.read_csv('preprocess_test.csv').to_numpy()
    # column 0 is the label, column 1 is the message text (np.str was removed in NumPy 1.24)
    return test_data[:, 1].astype(str), test_data[:, 0].astype(np.int32)
# 1 is ham and 0 is spam
x_test, y_test = load_data()
max_review_len = 80
embedding_len = 100
units = 64
# preprocess (create encoder)
encoder = tfds.features.text.TokenTextEncoder.load_from_file('lib')
vocab_size = encoder.vocab_size
print('Vocabulary encoder loaded.')
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    encoded_text = keras.preprocessing.sequence.pad_sequences([encoded_text], maxlen=max_review_len)
    return encoded_text[0], label
def encode_map_fn(text, label):
    # py_function doesn't set the shape of the returned tensors.
    encoded_text, label = tf.py_function(encode,
                                         inp=[text, label],
                                         Tout=(tf.int32, tf.int32))
    # `tf.data.Dataset` works best if all components have a shape set,
    # so set the shapes manually:
    encoded_text.set_shape([max_review_len])
    label.set_shape([])
    return encoded_text, label
db_test = (tf.data.Dataset.from_tensor_slices((x_test, y_test))
           .map(encode_map_fn)
           .batch(32, drop_remainder=True))
print('Loading model...')
# the architecture must match train.py exactly, or the saved weights will not load
model = Sequential([
    layers.Embedding(vocab_size + 10, embedding_len, input_length=max_review_len),
    layers.LSTM(units, dropout=0.5, return_sequences=True, unroll=True),
    layers.LSTM(units, dropout=0.5, unroll=True),
    layers.Dense(1, activation=tf.sigmoid)
])
model.compile(optimizer=keras.optimizers.Adam(0.001),
              loss=tf.losses.BinaryCrossentropy(),
              metrics=['accuracy'])
model.load_weights('ckpt/weights.ckpt')
print('Model loaded.')
score = model.evaluate(db_test)
print('Evaluation finished, accuracy:', score[1])
predict.py
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import Sequential, layers
import tensorflow as tf
# path to the file to be predicted
file_path = 'data/test.txt'
max_review_len = 80
embedding_len = 100
units = 64
# preprocess (create encoder)
encoder = tfds.features.text.TokenTextEncoder.load_from_file('lib')
vocab_size = encoder.vocab_size
print('Vocabulary encoder loaded.')
x = []
with open(file_path, 'r') as f:
    data = f.read().split('\n')
for i in data:
    # empty lines still produce a (zero-padded) row, keeping output aligned with input lines
    x.append(encoder.encode(i.lower()))
x = keras.preprocessing.sequence.pad_sequences(x, maxlen=max_review_len)
print('Data loaded.')
print('Loading model...')
model = Sequential([
    layers.Embedding(vocab_size + 10, embedding_len, input_length=max_review_len),
    layers.LSTM(units, dropout=0.5, return_sequences=True, unroll=True),
    layers.LSTM(units, dropout=0.5, unroll=True),
    layers.Dense(1, activation=tf.sigmoid)
])
model.compile(optimizer=keras.optimizers.Adam(0.001),
              loss=tf.losses.BinaryCrossentropy(),
              metrics=['accuracy'])
model.load_weights('ckpt/weights.ckpt')
print('Model loaded, predicting...')
pred = model.predict(x)
print('Prediction finished, writing probabilities...')
with open('predict_result.txt', 'w') as fp:
    for i in pred:
        # the sigmoid output is P(ham), so 1 - p is the spam probability
        fp.write('%f\n' % (1 - i[0]))
print('Probabilities written.')
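The result file now holds one spam probability per input line. A minimal post-processing sketch, assuming a 0.5 decision threshold (my assumption, not something the competition specifies), that turns the probabilities back into the 0/1 labels used during training:

# Hypothetical post-processing: 0 = spam, 1 = ham, matching the training labels;
# 0.5 is an assumed threshold
with open('predict_result.txt') as f:
    labels = [0 if float(line) >= 0.5 else 1 for line in f if line.strip()]
print(labels[:10])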