線性伸縮(linear scaling, LS)法的主要精神,是假設你唱歌的節奏雖然會跟資料庫的節奏不太一樣,但是變化是均勻的,不會忽快忽慢;因此,我們可以將查詢輸入的音高序列,做出多個不同比率的拉伸或縮短版本後,再來進行比對。

我們先來看一個簡單的伸縮範例。以下範例會將輸入的音高向量進行伸縮,並且跟資料庫當中的音高向量並排展示,你可以目視觀察看看,是哪個倍率的伸縮版本,跟資料庫當中的音高向量最為相似:

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d


def vec_lin_scaling(vec, lowerRatio, upperRatio, resolution):
	scaleFactor = np.linspace(lowerRatio, upperRatio, resolution)
	scaledVec = []
	func = interp1d(np.arange(vec.size), vec)
	for f in scaleFactor:
		newVec = func(np.linspace(0, vec.size-1, int(f*vec.size)))
		newVec -= np.mean(newVec)
		scaledVec.append(newVec)
	return scaledVec


inputPitch = np.array([48.04, 49.36, 50.13, 50.62, 51.11, 51.49, 51.49, 50.82, 50.15, 49.89, 50.67, 51.06, 49.9, 49.52, 49.52, 51.14, 51.14, 51.52, 51.27, 51.18, 52.02, 51.49, 51.33, 51.14, 51.14, 51.14, 51.14, 51.14, 50.55, 50.47, 50.64, 50.39, 48.35, 51.35, 51.49, 51.27, 50.89, 51.49, 51.49, 51.49, 55.78, 55.14, 54.92, 55.35, 55.35, 55.35, 55.35, 55.35, 54.01, 58.46, 59.63, 59.76, 59.76, 58.06, 57.99, 58.68, 58.68, 57.94, 55.06, 55.37, 55.79, 55.79, 54.73, 56.14, 55.35, 55.35, 55.04, 54.51, 53.29, 50.16, 50.9, 51.14, 51.07, 50.81, 50.54, 51.25, 51.49, 51.49, 51.49, 52.08, 52.56, 52.56, 52.2, 52.0, 53.23, 53.31, 52.98, 53.06, 53.0, 52.45, 52.77, 53.21, 52.14, 52.79, 53.18, 52.87, 52.79, 52.89, 52.56, 52.6, 53.31, 53.31, 53.13, 52.93, 52.77, 52.9, 52.93, 52.93, 52.93, 52.93, 53.31, 53.31, 52.19, 52.19, 53.88, 52.93, 52.93, 52.93, 51.14, 51.14, 48.94, 49.52, 49.84, 49.24, 49.48, 48.85, 48.33, 48.33, 48.33, 49.62, 52.58, 53.31, 53.03, 52.93, 53.19, 52.93, 52.69, 52.39, 52.2, 49.26, 50.01, 49.83, 49.08, 48.62, 49.16, 49.84, 49.84, 47.09, 46.68, 46.25, 45.66, 45.85, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 50.84, 51.49, 51.14])
dbPitch = 1.0 * np.array([60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 60, 60, 60, 60])

dbPitch -= np.mean(dbPitch)
scaledVecs = vec_lin_scaling(inputPitch, 0.5, 1.5, 5)

plt.plot(dbPitch+60)
plt.plot(scaledVecs[0]+50)
plt.plot(scaledVecs[1]+40)
plt.plot(scaledVecs[2]+30)
plt.plot(scaledVecs[3]+20)
plt.plot(scaledVecs[4]+10)
plt.xlabel('Vector Length')
plt.yticks(
	[10, 20, 30, 40, 50, 60],
	[
		'Stretched by 1.5', 'Stretched by 1.25', 'Original input pitch',
		'Compressed by 0.75', 'Compressed by 0.5', 'Target pitch'
	]
)

plt.figure()
plt.plot(dbPitch[:len(scaledVecs[3])], label='Target pitch')
plt.plot(scaledVecs[3], label='Scaled Input Pitch')
plt.legend()

plt.show()

在上述的程式碼中:

處理完了節奏快慢的問題後,接著還要處理 key 不同的問題,因為有可能你唱的是男(女)key,但是資料庫中的歌曲是女(男)key,因此我們需要「移調」。一般來說,我們可以把查詢輸入和資料庫中待比對的片段,都平移到音高的平均(或中位數)為 0 的位置;或者將資料庫中待比對的片段的平均(或中位數)計算出來,再將查詢輸入的音高向量平移過去,以便處理 key 不同的問題。並且由於你一定會需要計算資料庫中待比對的片段的平均(或中位數),所以若有加速需求且資源配置允許的話,可以用空間換取時間,事先把各段的平均算出來,就不用在每次比對的時候,重複計算一模一樣的東西。

因此,對於一個輸入查詢的某一個伸縮版本以及資料庫中的某一首歌曲之間,我們可以將兩邊重疊的部分,視作兩個向量來計算距離;而一個輸入查詢跟資料庫中的某一首歌曲之間,則可以用該輸入查詢的所有伸縮版本跟資料庫中的某一首歌曲之間的距離的最小值,來做為其距離。此處需要注意的是,如果你在進行移調時計算的是平均,則必須使用 2-norm,而若移調時使用的是中位數,則必須使用 1-norm,那麼在數學上才會是最短距離;但因為平均跟 1-norm 相對於中位數跟 2-norm 的組合來說,消耗的計算資源比較少一些,所以有些時候(特別是在需要計較速度的情況下),會用平均來移調以及用 1-norm 來計算距離。底下的範例,會對一個輸入查詢跟資料庫中的某一首歌曲之間計算距離,並顯示每個伸縮比以及其造成的距離:

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d


def vec_lin_scaling(vec, lowerRatio, upperRatio, resolution):
	scaleFactor = np.linspace(lowerRatio, upperRatio, resolution)
	scaledVec = []
	func = interp1d(np.arange(vec.size), vec)
	for f in scaleFactor:
		newVec = func(np.linspace(0, vec.size-1, int(f*vec.size)))
		newVec -= np.mean(newVec)
		scaledVec.append(newVec)
	return scaledVec


def get_dist_for_one_db_song(query_vecs, scale_factors, db_song):
	min_dist = np.inf
	for f, vec in zip(scale_factors, query_vecs):
		min_len = min(len(vec), len(db_song))
		shifted_db_song = db_song[:min_len] - np.mean(db_song[:min_len])
		shifted_vec = vec[:min_len] - np.mean(vec[:min_len])
		dist = np.mean(np.abs(shifted_db_song - shifted_vec))
		min_dist = min(min_dist, dist)
		print(f, dist)
	return min_dist


inputPitch = np.array([48.04, 49.36, 50.13, 50.62, 51.11, 51.49, 51.49, 50.82, 50.15, 49.89, 50.67, 51.06, 49.9, 49.52, 49.52, 51.14, 51.14, 51.52, 51.27, 51.18, 52.02, 51.49, 51.33, 51.14, 51.14, 51.14, 51.14, 51.14, 50.55, 50.47, 50.64, 50.39, 48.35, 51.35, 51.49, 51.27, 50.89, 51.49, 51.49, 51.49, 55.78, 55.14, 54.92, 55.35, 55.35, 55.35, 55.35, 55.35, 54.01, 58.46, 59.63, 59.76, 59.76, 58.06, 57.99, 58.68, 58.68, 57.94, 55.06, 55.37, 55.79, 55.79, 54.73, 56.14, 55.35, 55.35, 55.04, 54.51, 53.29, 50.16, 50.9, 51.14, 51.07, 50.81, 50.54, 51.25, 51.49, 51.49, 51.49, 52.08, 52.56, 52.56, 52.2, 52.0, 53.23, 53.31, 52.98, 53.06, 53.0, 52.45, 52.77, 53.21, 52.14, 52.79, 53.18, 52.87, 52.79, 52.89, 52.56, 52.6, 53.31, 53.31, 53.13, 52.93, 52.77, 52.9, 52.93, 52.93, 52.93, 52.93, 53.31, 53.31, 52.19, 52.19, 53.88, 52.93, 52.93, 52.93, 51.14, 51.14, 48.94, 49.52, 49.84, 49.24, 49.48, 48.85, 48.33, 48.33, 48.33, 49.62, 52.58, 53.31, 53.03, 52.93, 53.19, 52.93, 52.69, 52.39, 52.2, 49.26, 50.01, 49.83, 49.08, 48.62, 49.16, 49.84, 49.84, 47.09, 46.68, 46.25, 45.66, 45.85, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 50.84, 51.49, 51.14])
dbPitch = 1.0 * np.array([60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 60, 60, 60, 60])
scaledVecs = vec_lin_scaling(inputPitch, 0.5, 1.5, 5)
min_dist = get_dist_for_one_db_song(scaledVecs, np.linspace(0.5, 1.5, 5), dbPitch)
print('Min dist:', min_dist)

在上述的程式碼中:

因此,對於一個輸入查詢,我們只要依照前述的方法,把它跟資料庫中的所有歌曲都比對過一遍,並依照它與資料庫中每首歌的最小距離進行遞增排序,就可以做為系統該回傳給使用者的結果候選清單:

import os

import numpy as np
import matplotlib.pyplot as plt
import pretty_midi
from scipy.interpolate import interp1d


def read_database(path, fs=31.25):
	with open(os.path.join(path, 'songList.txt'), 'r') as fin:
		cnt = fin.read().splitlines()
	db_song_names = [' '.join(line.split('\t')[1:3]) for line in cnt]
	midi_files = sorted(os.listdir(path))
	db_pitches = []
	for mf in midi_files:
		if not mf.endswith('.mid'):
			continue
		midi = pretty_midi.PrettyMIDI(os.path.join(path, mf))
		piano_roll = midi.get_piano_roll(fs=fs) # Shape: (semitone, time_step)
		pitches = np.argmax(piano_roll, axis=0)
		db_pitches.append(pitches)
	return db_pitches, db_song_names


def vec_lin_scaling(vec, lowerRatio, upperRatio, resolution):
	scaleFactor = np.linspace(lowerRatio, upperRatio, resolution)
	scaledVec = []
	func = interp1d(np.arange(vec.size), vec)
	for f in scaleFactor:
		newVec = func(np.linspace(0, vec.size-1, int(f*vec.size)))
		newVec -= np.mean(newVec)
		scaledVec.append(newVec)
	return scaledVec


def get_dist_for_one_db_song(query_vecs, db_song):
	min_dist = np.inf
	for vec in query_vecs:
		min_len = min(len(vec), len(db_song))
		shifted_db_song = db_song[:min_len] - np.mean(db_song[:min_len])
		shifted_vec = vec[:min_len] - np.mean(vec[:min_len])
		dist = np.mean(np.abs(shifted_db_song - shifted_vec))
		min_dist = min(min_dist, dist)
	return min_dist


def compare_to_whole_db(scaled_vecs, db_pitches):
	all_dists = [
		get_dist_for_one_db_song(scaled_vecs, db_pitch) for db_pitch in db_pitches
	]
	return np.argsort(all_dists)


input_pitch = np.array([48.04, 49.36, 50.13, 50.62, 51.11, 51.49, 51.49, 50.82, 50.15, 49.89, 50.67, 51.06, 49.9, 49.52, 49.52, 51.14, 51.14, 51.52, 51.27, 51.18, 52.02, 51.49, 51.33, 51.14, 51.14, 51.14, 51.14, 51.14, 50.55, 50.47, 50.64, 50.39, 48.35, 51.35, 51.49, 51.27, 50.89, 51.49, 51.49, 51.49, 55.78, 55.14, 54.92, 55.35, 55.35, 55.35, 55.35, 55.35, 54.01, 58.46, 59.63, 59.76, 59.76, 58.06, 57.99, 58.68, 58.68, 57.94, 55.06, 55.37, 55.79, 55.79, 54.73, 56.14, 55.35, 55.35, 55.04, 54.51, 53.29, 50.16, 50.9, 51.14, 51.07, 50.81, 50.54, 51.25, 51.49, 51.49, 51.49, 52.08, 52.56, 52.56, 52.2, 52.0, 53.23, 53.31, 52.98, 53.06, 53.0, 52.45, 52.77, 53.21, 52.14, 52.79, 53.18, 52.87, 52.79, 52.89, 52.56, 52.6, 53.31, 53.31, 53.13, 52.93, 52.77, 52.9, 52.93, 52.93, 52.93, 52.93, 53.31, 53.31, 52.19, 52.19, 53.88, 52.93, 52.93, 52.93, 51.14, 51.14, 48.94, 49.52, 49.84, 49.24, 49.48, 48.85, 48.33, 48.33, 48.33, 49.62, 52.58, 53.31, 53.03, 52.93, 53.19, 52.93, 52.69, 52.39, 52.2, 49.26, 50.01, 49.83, 49.08, 48.62, 49.16, 49.84, 49.84, 47.09, 46.68, 46.25, 45.66, 45.85, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 50.84, 51.49, 51.14])
db_pitches, db_song_names = read_database('MIR-QBSH/midiFile')

scaled_vecs = vec_lin_scaling(input_pitch, 0.5, 1.5, 5)
sorted_idx = compare_to_whole_db(scaled_vecs, db_pitches)
for idx in sorted_idx[:10]:
	print(db_song_names[idx])

你可以把上述固定寫死的 input_pitch,換成自己從音檔抽取出來的音高向量,就可以是一個簡單的哼唱選歌系統。如果有需要評估系統的辨識效果,也可以把簡介中提到的 top-n accuracy 或 MRR 實作出來,加到你的程式碼當中。