線上教材：音樂資訊分析

線性伸縮(linear scaling, LS)法的主要精神，是假設你唱歌的節奏雖然會跟資料庫的節奏不太一樣，但是變化是均勻的，不會忽快忽慢；因此，我們可以將查詢輸入的音高序列，做出多個不同比率的拉伸或縮短版本後，再來進行比對。

我們先來看一個簡單的伸縮範例。以下範例會將輸入的音高向量進行伸縮，並且跟資料庫當中的音高向量並排展示，你可以目視觀察看看，是哪個倍率的伸縮版本，跟資料庫當中的音高向量最為相似：
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d


def vec_lin_scaling(vec, lowerRatio, upperRatio, resolution):
	scaleFactor = np.linspace(lowerRatio, upperRatio, resolution)
	scaledVec = []
	func = interp1d(np.arange(vec.size), vec)
	for f in scaleFactor:
		newVec = func(np.linspace(0, vec.size-1, int(f*vec.size)))
		newVec -= np.mean(newVec)
		scaledVec.append(newVec)
	return scaledVec


inputPitch = np.array([48.04, 49.36, 50.13, 50.62, 51.11, 51.49, 51.49, 50.82, 50.15, 49.89, 50.67, 51.06, 49.9, 49.52, 49.52, 51.14, 51.14, 51.52, 51.27, 51.18, 52.02, 51.49, 51.33, 51.14, 51.14, 51.14, 51.14, 51.14, 50.55, 50.47, 50.64, 50.39, 48.35, 51.35, 51.49, 51.27, 50.89, 51.49, 51.49, 51.49, 55.78, 55.14, 54.92, 55.35, 55.35, 55.35, 55.35, 55.35, 54.01, 58.46, 59.63, 59.76, 59.76, 58.06, 57.99, 58.68, 58.68, 57.94, 55.06, 55.37, 55.79, 55.79, 54.73, 56.14, 55.35, 55.35, 55.04, 54.51, 53.29, 50.16, 50.9, 51.14, 51.07, 50.81, 50.54, 51.25, 51.49, 51.49, 51.49, 52.08, 52.56, 52.56, 52.2, 52.0, 53.23, 53.31, 52.98, 53.06, 53.0, 52.45, 52.77, 53.21, 52.14, 52.79, 53.18, 52.87, 52.79, 52.89, 52.56, 52.6, 53.31, 53.31, 53.13, 52.93, 52.77, 52.9, 52.93, 52.93, 52.93, 52.93, 53.31, 53.31, 52.19, 52.19, 53.88, 52.93, 52.93, 52.93, 51.14, 51.14, 48.94, 49.52, 49.84, 49.24, 49.48, 48.85, 48.33, 48.33, 48.33, 49.62, 52.58, 53.31, 53.03, 52.93, 53.19, 52.93, 52.69, 52.39, 52.2, 49.26, 50.01, 49.83, 49.08, 48.62, 49.16, 49.84, 49.84, 47.09, 46.68, 46.25, 45.66, 45.85, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 50.84, 51.49, 51.14])
dbPitch = 1.0 * np.array([60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 60, 60, 60, 60])

dbPitch -= np.mean(dbPitch)
scaledVecs = vec_lin_scaling(inputPitch, 0.5, 1.5, 5)

plt.plot(dbPitch+60)
plt.plot(scaledVecs[0]+50)
plt.plot(scaledVecs[1]+40)
plt.plot(scaledVecs[2]+30)
plt.plot(scaledVecs[3]+20)
plt.plot(scaledVecs[4]+10)
plt.xlabel('Vector Length')
plt.yticks(
	[10, 20, 30, 40, 50, 60],
	[
		'Stretched by 1.5', 'Stretched by 1.25', 'Original input pitch',
		'Compressed by 0.75', 'Compressed by 0.5', 'Target pitch'
	]
)

plt.figure()
plt.plot(dbPitch[:len(scaledVecs[3])], label='Target pitch')
plt.plot(scaledVecs[3], label='Scaled Input Pitch')
plt.legend()

plt.show()
在上述的程式碼中：

輸入的音高向量是從人的聲音當中抽取，因此易有跳動不穩定的現象；資料庫當中的音高於本範例中是從 MIDI 檔案中得來，因此非常穩定。

此處使用 scipy.interpolate 的 interp1d 來進行伸縮，它的用法是需要先用原本的 x 和 y 建立一個函式，然後將新的 x 輸入。以此例而言，我們可以將原本的 y 訂為要被伸縮的向量，原本的 x 是向量的索引值，即 0 到其長度減一，而新的 x 可以訂為伸縮後的虛擬索引值，其值的範圍是 0 到原版長度減一，但是數字的個數變為原本的點數乘以伸縮倍率，代表我們在相同的 x 範圍內，取了不一樣個數的資料點。

將查詢輸入的音高序列和資料庫的音高序列都平移到平均為 0 的原因，在此範例中主要是顯示上的方便；在實際比對上我們也需要將音高向量做平移，原因請見下段探討。

處理完了節奏快慢的問題後，接著還要處理 key 不同的問題，因為有可能你唱的是男（女）key，但是資料庫中的歌曲是女（男）key，因此我們需要「移調」。一般來說，我們可以把查詢輸入和資料庫中待比對的片段，都平移到音高的平均（或中位數）為 0 的位置；或者將資料庫中待比對的片段的平均（或中位數）計算出來，再將查詢輸入的音高向量平移過去，以便處理 key 不同的問題。並且由於你一定會需要計算資料庫中待比對的片段的平均（或中位數），所以若有加速需求且資源配置允許的話，可以用空間換取時間，事先把各段的平均算出來，就不用在每次比對的時候，重複計算一模一樣的東西。

因此，對於一個輸入查詢的某一個伸縮版本以及資料庫中的某一首歌曲之間，我們可以將兩邊重疊的部分，視作兩個向量來計算距離；而一個輸入查詢跟資料庫中的某一首歌曲之間，則可以用該輸入查詢的所有伸縮版本跟資料庫中的某一首歌曲之間的距離的最小值，來做為其距離。此處需要注意的是，如果你在進行移調時計算的是平均，則必須使用 2-norm，而若移調時使用的是中位數，則必須使用 1-norm，那麼在數學上才會是最短距離；但因為平均跟 1-norm 相對於中位數跟 2-norm 的組合來說，消耗的計算資源比較少一些，所以有些時候（特別是在需要計較速度的情況下），會用平均來移調以及用 1-norm 來計算距離。底下的範例，會對一個輸入查詢跟資料庫中的某一首歌曲之間計算距離，並顯示每個伸縮比以及其造成的距離：
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d


def vec_lin_scaling(vec, lowerRatio, upperRatio, resolution):
	scaleFactor = np.linspace(lowerRatio, upperRatio, resolution)
	scaledVec = []
	func = interp1d(np.arange(vec.size), vec)
	for f in scaleFactor:
		newVec = func(np.linspace(0, vec.size-1, int(f*vec.size)))
		newVec -= np.mean(newVec)
		scaledVec.append(newVec)
	return scaledVec


def get_dist_for_one_db_song(query_vecs, scale_factors, db_song):
	min_dist = np.inf
	for f, vec in zip(scale_factors, query_vecs):
		min_len = min(len(vec), len(db_song))
		shifted_db_song = db_song[:min_len] - np.mean(db_song[:min_len])
		shifted_vec = vec[:min_len] - np.mean(vec[:min_len])
		dist = np.mean(np.abs(shifted_db_song - shifted_vec))
		min_dist = min(min_dist, dist)
		print(f, dist)
	return min_dist


inputPitch = np.array([48.04, 49.36, 50.13, 50.62, 51.11, 51.49, 51.49, 50.82, 50.15, 49.89, 50.67, 51.06, 49.9, 49.52, 49.52, 51.14, 51.14, 51.52, 51.27, 51.18, 52.02, 51.49, 51.33, 51.14, 51.14, 51.14, 51.14, 51.14, 50.55, 50.47, 50.64, 50.39, 48.35, 51.35, 51.49, 51.27, 50.89, 51.49, 51.49, 51.49, 55.78, 55.14, 54.92, 55.35, 55.35, 55.35, 55.35, 55.35, 54.01, 58.46, 59.63, 59.76, 59.76, 58.06, 57.99, 58.68, 58.68, 57.94, 55.06, 55.37, 55.79, 55.79, 54.73, 56.14, 55.35, 55.35, 55.04, 54.51, 53.29, 50.16, 50.9, 51.14, 51.07, 50.81, 50.54, 51.25, 51.49, 51.49, 51.49, 52.08, 52.56, 52.56, 52.2, 52.0, 53.23, 53.31, 52.98, 53.06, 53.0, 52.45, 52.77, 53.21, 52.14, 52.79, 53.18, 52.87, 52.79, 52.89, 52.56, 52.6, 53.31, 53.31, 53.13, 52.93, 52.77, 52.9, 52.93, 52.93, 52.93, 52.93, 53.31, 53.31, 52.19, 52.19, 53.88, 52.93, 52.93, 52.93, 51.14, 51.14, 48.94, 49.52, 49.84, 49.24, 49.48, 48.85, 48.33, 48.33, 48.33, 49.62, 52.58, 53.31, 53.03, 52.93, 53.19, 52.93, 52.69, 52.39, 52.2, 49.26, 50.01, 49.83, 49.08, 48.62, 49.16, 49.84, 49.84, 47.09, 46.68, 46.25, 45.66, 45.85, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 50.84, 51.49, 51.14])
dbPitch = 1.0 * np.array([60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 60, 60, 60, 60])
scaledVecs = vec_lin_scaling(inputPitch, 0.5, 1.5, 5)
min_dist = get_dist_for_one_db_song(scaledVecs, np.linspace(0.5, 1.5, 5), dbPitch)
print('Min dist:', min_dist)
在上述的程式碼中：

同時顯示伸縮比跟距離，是為了讓各位方便跟第一個範例的目視觀察結果做對照。實務上，除非你有如本範例的研究觀察用途等特殊需求，否則只需要最後計算出的距離即可，不需要知道是哪個伸縮比造成的距離。

此處用的移調和距離計算的組合，是前述比較省計算資源的平均和 1-norm 的組合，你也可以改成數學上正確配對的組合，來觀察看看計算結果。

一般來說，資料庫的歌曲長度會遠長於輸入查詢，本範例的求取資料庫歌曲和輸入查詢的最小長度，屬於減少程式發生錯誤的機會的做法。你在實作自己的系統時，如果對兩邊的長度範圍很有信心，也可以略過相關計算。

為了減少各位執行範例所需的時間，伸縮比仍然設定為 0.5 到 1.5 之間取 5 個伸縮版本。實務上，可能會需要取至少 11 個伸縮版本，整個系統才會有堪用的準確度；當然你可以取更多伸縮版本，例如 31 或 51 個；取更多伸縮版本通常會讓準確度上升，但是當然也會花更久的時間。此外，伸縮比的上下界代表你假設查詢輸入不會比資料庫歌曲的節奏快或慢過某個程度，因此若有需要，你也可以調整看看這兩個值。

因為 MIR-1K 資料集的錄音都是從頭開始唱，而且也是為了減少各位執行範例所需的時間，所以本範例是從資料庫的開頭開始比對。若要處理「查詢輸入可能是從歌曲中間某處開始哼唱」的情況，則你可能會需要從 MIDI 檔中讀取出音符的起始位置，或者是進一步事先對資料庫標記出歌詞中每個字或者每一句的起始位置，來做為比對的起點。

因此，對於一個輸入查詢，我們只要依照前述的方法，把它跟資料庫中的所有歌曲都比對過一遍，並依照它與資料庫中每首歌的最小距離進行遞增排序，就可以做為系統該回傳給使用者的結果候選清單：
import os

import numpy as np
import matplotlib.pyplot as plt
import pretty_midi
from scipy.interpolate import interp1d


def read_database(path, fs=31.25):
	with open(os.path.join(path, 'songList.txt'), 'r') as fin:
		cnt = fin.read().splitlines()
	db_song_names = [' '.join(line.split('\t')[1:3]) for line in cnt]
	midi_files = sorted(os.listdir(path))
	db_pitches = []
	for mf in midi_files:
		if not mf.endswith('.mid'):
			continue
		midi = pretty_midi.PrettyMIDI(os.path.join(path, mf))
		piano_roll = midi.get_piano_roll(fs=fs) # Shape: (semitone, time_step)
		pitches = np.argmax(piano_roll, axis=0)
		db_pitches.append(pitches)
	return db_pitches, db_song_names


def vec_lin_scaling(vec, lowerRatio, upperRatio, resolution):
	scaleFactor = np.linspace(lowerRatio, upperRatio, resolution)
	scaledVec = []
	func = interp1d(np.arange(vec.size), vec)
	for f in scaleFactor:
		newVec = func(np.linspace(0, vec.size-1, int(f*vec.size)))
		newVec -= np.mean(newVec)
		scaledVec.append(newVec)
	return scaledVec


def get_dist_for_one_db_song(query_vecs, db_song):
	min_dist = np.inf
	for vec in query_vecs:
		min_len = min(len(vec), len(db_song))
		shifted_db_song = db_song[:min_len] - np.mean(db_song[:min_len])
		shifted_vec = vec[:min_len] - np.mean(vec[:min_len])
		dist = np.mean(np.abs(shifted_db_song - shifted_vec))
		min_dist = min(min_dist, dist)
	return min_dist


def compare_to_whole_db(scaled_vecs, db_pitches):
	all_dists = [
		get_dist_for_one_db_song(scaled_vecs, db_pitch) for db_pitch in db_pitches
	]
	return np.argsort(all_dists)


input_pitch = np.array([48.04, 49.36, 50.13, 50.62, 51.11, 51.49, 51.49, 50.82, 50.15, 49.89, 50.67, 51.06, 49.9, 49.52, 49.52, 51.14, 51.14, 51.52, 51.27, 51.18, 52.02, 51.49, 51.33, 51.14, 51.14, 51.14, 51.14, 51.14, 50.55, 50.47, 50.64, 50.39, 48.35, 51.35, 51.49, 51.27, 50.89, 51.49, 51.49, 51.49, 55.78, 55.14, 54.92, 55.35, 55.35, 55.35, 55.35, 55.35, 54.01, 58.46, 59.63, 59.76, 59.76, 58.06, 57.99, 58.68, 58.68, 57.94, 55.06, 55.37, 55.79, 55.79, 54.73, 56.14, 55.35, 55.35, 55.04, 54.51, 53.29, 50.16, 50.9, 51.14, 51.07, 50.81, 50.54, 51.25, 51.49, 51.49, 51.49, 52.08, 52.56, 52.56, 52.2, 52.0, 53.23, 53.31, 52.98, 53.06, 53.0, 52.45, 52.77, 53.21, 52.14, 52.79, 53.18, 52.87, 52.79, 52.89, 52.56, 52.6, 53.31, 53.31, 53.13, 52.93, 52.77, 52.9, 52.93, 52.93, 52.93, 52.93, 53.31, 53.31, 52.19, 52.19, 53.88, 52.93, 52.93, 52.93, 51.14, 51.14, 48.94, 49.52, 49.84, 49.24, 49.48, 48.85, 48.33, 48.33, 48.33, 49.62, 52.58, 53.31, 53.03, 52.93, 53.19, 52.93, 52.69, 52.39, 52.2, 49.26, 50.01, 49.83, 49.08, 48.62, 49.16, 49.84, 49.84, 47.09, 46.68, 46.25, 45.66, 45.85, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 46.16, 50.84, 51.49, 51.14])
db_pitches, db_song_names = read_database('MIR-QBSH/midiFile')

scaled_vecs = vec_lin_scaling(input_pitch, 0.5, 1.5, 5)
sorted_idx = compare_to_whole_db(scaled_vecs, db_pitches)
for idx in sorted_idx[:10]:
	print(db_song_names[idx])
你可以把上述固定寫死的 input_pitch，換成自己從音檔抽取出來的音高向量，就可以是一個簡單的哼唱選歌系統。如果有需要評估系統的辨識效果，也可以把簡介中提到的 top-n accuracy 或 MRR 實作出來，加到你的程式碼當中。