python - 将通用句子编码器(Universal Sentence Encoder)保存为 TFLite,或通过 TensorFlow API 对外提供服务
问题描述
我有一段代码,使用预先构建的通用句子编码器(Universal Sentence Encoder)来计算句子相似度。它以一个 .txt 文件作为输入,计算余弦相似度,然后接受用户输入的查询,并据此找出最相似的句子。代码如下:
# TensorFlow Hub module URL for the Universal Sentence Encoder.
# The trailing `#@param [...]` comment lists the two selectable module URLs
# (it looks like a Colab form-field annotation — keep it on the same line).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
def get_features(texts):
    """Embed one or more texts by evaluating ``embed`` in a TF session.

    Args:
        texts: A single string or a list of strings. A lone string is
            wrapped in a one-element list before being embedded.

    Returns:
        The result of ``sess.run(embed(texts))`` -- presumably one embedding
        row per input text (TODO confirm against the ``embed`` module).
    """
    # isinstance also accepts str subclasses, unlike an exact type() check.
    if isinstance(texts, str):
        texts = [texts]
    # NOTE: a new session is opened and the global variables and tables are
    # re-initialized on every call, so batching texts into a single call
    # avoids repeating that setup.
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))
def remove_stopwords(stop_words, tokens):
    """Return the tokens that are not stop words, preserving their order.

    Args:
        stop_words: Container supporting ``in`` (a set gives O(1) lookups).
        tokens: Iterable of tokens to filter.

    Returns:
        A new list with every token found in ``stop_words`` removed.
    """
    return [token for token in tokens if token not in stop_words]
# Ordered (pattern, replacement) pairs applied by process_text.
# Order matters: specific contractions ("won't", "isn't", "can't") must be
# expanded before the generic "n't" rule, and all apostrophe rules must run
# before the catch-all non-word-character rule strips the apostrophes.
_TEXT_SUBSTITUTIONS = (
    (r'http\S+', ' '),                 # URLs
    (r'#+', ' '),                      # hash signs (the tag word itself is kept)
    (r'@[A-Za-z0-9]+', ' '),           # @mentions
    (r"([A-Za-z]+)'s", r"\1 is"),      # "word's" -> "word is"
    (r"\'ve", " have "),
    (r"won't", "will not "),
    (r"isn't", "is not "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r'\W', ' '),                      # any remaining non-word character
    (r'\d+', ' '),                     # digit runs
    (r'\s+', ' '),                     # collapse whitespace
)


def process_text(text):
    """Normalize raw text into lowercase, space-separated ASCII words.

    Non-ASCII characters are dropped, the text is lowercased, URLs,
    hash signs, @mentions, punctuation and digits are replaced by spaces,
    common English contractions are expanded, and whitespace is collapsed.

    Args:
        text: The input string.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Drop anything that cannot be represented in ASCII.
    text = text.encode('ascii', errors='ignore').decode()
    text = text.lower()
    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def lemmatize(tokens):
    """Lemmatize each token with NLTK's WordNetLemmatizer.

    Each token is first lemmatized as a verb (``'v'``). If that leaves the
    token unchanged, it is lemmatized again without a part-of-speech
    argument, i.e. using the lemmatizer's default.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of lemmas, one per input token, in the same order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_list = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token, 'v')
        if lemma == token:
            # The verb pass changed nothing; fall back to the default POS.
            lemma = lemmatizer.lemmatize(token)
        lemma_list.append(lemma)
    return lemma_list
def process_all(text):
    """Normalize ``text`` with process_text and drop stop words.

    Args:
        text: The raw input string.

    Returns:
        The cleaned text with every token found in the module-level
        ``stop_words`` removed, joined by single spaces.
    """
    cleaned = process_text(text)
    kept = remove_stopwords(stop_words, cleaned.split())
    return ' '.join(kept)
# Quick sanity check of the normalizer; the value is only displayed when run
# interactively (notebook / REPL) and is otherwise discarded.
process_text("Hello! Who are you?")

# Load the corpus: one entry per line, surrounding whitespace stripped.
# The encoding is given explicitly so the read does not depend on the
# platform default.
with open('/content/sample_data/training.txt', encoding='utf-8') as f:
    text = [i.strip() for i in f]

# Normalized copy of every corpus line.
data_processed = list(map(process_text, text))
len(data_processed)

# NOTE(review): the embeddings are computed from the raw `text`, not from
# `data_processed`, while semantic_search normalizes the query with
# process_text before embedding it -- confirm this asymmetry is intended.
BASE_VECTORS = get_features(text)
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (anything accepted by ``np.linalg.norm``/``np.dot``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero magnitude (which would otherwise divide by zero).
    """
    mag1 = np.linalg.norm(v1)
    mag2 = np.linalg.norm(v2)
    if mag1 == 0 or mag2 == 0:
        return 0.0
    return np.dot(v1, v2) / (mag1 * mag2)
def test_similiarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded with get_features and the first row of each
    result is compared. The shape of the first embedding is printed as a
    debugging aid.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The value of ``cosine_similarity`` for the two embeddings.
    """
    vec1 = get_features(text1)[0]
    vec2 = get_features(text2)[0]
    print(vec1.shape)
    return cosine_similarity(vec1, vec2)
def semantic_search(query, data, vectors):
    """Rank ``data`` entries by cosine similarity to ``query``.

    Args:
        query: The search text; it is normalized with process_text and then
            embedded with get_features.
        data: Sequence of corpus strings.
        vectors: Indexable collection where ``vectors[i]`` is the embedding
            of ``data[i]``; it must have at least ``len(data)`` entries.

    Returns:
        A list of ``(similarity, first 100 characters of the entry, index)``
        tuples sorted by similarity, highest first.
    """
    # NOTE(review): the query is normalized here, so `vectors` should have
    # been computed from text prepared the same way -- verify at the caller.
    query = process_text(query)
    print("Extracting features...")
    query_vec = get_features(query)[0].ravel()
    res = []
    for i, d in enumerate(data):
        qvec = vectors[i].ravel()
        sim = cosine_similarity(query_vec, qvec)
        # Keep only a 100-character preview of each entry in the results.
        res.append((sim, d[:100], i))
    return sorted(res, key=lambda x: x[0], reverse=True)
# Example: rank every corpus entry against the query "da vinci".
# The returned list is not stored; it is only displayed when run interactively.
semantic_search("da vinci", data_processed, BASE_VECTORS)
我想保存这个模型并将其转换为 TFLite 格式。我查阅了很多资料,但没有找到任何解决方案。另外,我也想知道如何通过 TensorFlow 的 API 将它部署为服务。
解决方案
一种可行的做法是先以 SavedModel 格式保存模型,然后将生成的模型转换为 TFLite。请注意,模型能否成功转换取决于它所使用的操作(ops),某些模型架构可能无法转换为 TFLite 格式。
推荐阅读
- istio - 为什么 Istio 需要这么多听众
- javascript - 我想用给定次数的传播语法填充数组 [ ]
- vue.js - Nuxt 项目中没有布局文件夹
- python - GeoPandas, MatPlotLib, mapclassify plot with user defined bin
- mobx - MobX 观察者不会对可观察到的变化做出反应
- java - 如何在 IntelliJ IDEA 中添加库
- c# - 查找不在列表 A 但在列表 B 中的人员的更快方法
- rdkit - ECFP4指纹对应哪个RDKit指纹
- javascript - 如何清除反应 dropzone 文件输入以及成功图标并显示初始拖放
- java - 循环条件的未经检查的输入