diff --git a/lpcnetfreedv-private_libs.patch b/lpcnetfreedv-private_libs.patch
deleted file mode 100644
index 4b79232..0000000
--- a/lpcnetfreedv-private_libs.patch
+++ /dev/null
@@ -1,15 +0,0 @@
-diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
-index 6c49f5e..a0b9f0a 100644
---- a/src/CMakeLists.txt
-+++ b/src/CMakeLists.txt
-@@ -29,8 +29,8 @@ target_include_directories(lpcnetfreedv INTERFACE
-     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
- )
- install(TARGETS lpcnetfreedv EXPORT lpcnetfreedv-config
--    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
--    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/lpcnetfreedv
-+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}/lpcnetfreedv
-     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-     PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/lpcnet
- )
diff --git a/lpcnetfreedv-test.patch b/lpcnetfreedv-test.patch
new file mode 100644
index 0000000..eb94af1
--- /dev/null
+++ b/lpcnetfreedv-test.patch
@@ -0,0 +1,1486 @@
+diff --git a/.travis.yml b/.travis.yml
+index fb795aa..e92c398 100644
+--- a/.travis.yml
++++ b/.travis.yml
+@@ -29,7 +29,7 @@ script:
+     - cd src && sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s > /dev/null
+     # some LPCNet ctests
+     - ls -l
+-    - cd $BUILDDIR && ctest
++    - cd $BUILDDIR && ctest --output-on-failure
+     # Re-build codec2 with LPCNet and test FreeDV 2020 support
+     - cd $CODEC2DIR/build_linux
+     - make clean
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 680f52c..1d5b623 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -10,6 +10,7 @@ project(LPCNet C)
+ option(DISABLE_CPU_OPTIMIZATION "Disable CPU optimization discovery." OFF)
+ option(AVX2 "Enable AVX2 CPU optimizations." OFF)
+ option(AVX "Enable AVX CPU optimizations." OFF)
++option(SSE "Enable SSE CPU optimizations." OFF)
+ option(NEON "Enable NEON CPU optimizations for RPi." OFF)
+ 
+ include(GNUInstallDirs)
+@@ -19,6 +20,11 @@ mark_as_advanced(CLEAR
+     CMAKE_INSTALL_LIBDIR
+ )
+ 
++# Build universal ARM64 and x86_64 binaries on Mac.
++if(BUILD_OSX_UNIVERSAL)
++set(CMAKE_OSX_ARCHITECTURES "x86_64;arm64")
++endif(BUILD_OSX_UNIVERSAL)
++
+ #
+ # Prevent in-source builds
+ # If an in-source build is attempted, you will still need to clean up a few
+@@ -43,15 +49,41 @@ set(LPCNET_VERSION_MINOR 2)
+ set(LPCNET_VERSION_PATCH FALSE)
+ set(LPCNET_VERSION "${LPCNET_VERSION_MAJOR}.${LPCNET_VERSION_MINOR}")
+ # Patch level version bumps should not change API/ABI.
+-set(SOVERSION "${LPCNET_VERSION_MAJOR}.${LPCNET_VERSION_MINOR}")
++set(LPCNET_SOVERSION "${LPCNET_VERSION_MAJOR}.${LPCNET_VERSION_MINOR}")
+ if(LPCNET_VERSION_PATCH)
+     set(LPCNET_VERSION "${LPCNET_VERSION}.${LPCNET_VERSION_PATCH}")
+ endif()
+ message(STATUS "LPCNet version: ${LPCNET_VERSION}")
+ 
++#
++# Find the git hash if this is a working copy.
++#
++if(EXISTS "${CMAKE_SOURCE_DIR}/.git")
++    find_package(Git QUIET)
++    if(Git_FOUND)
++        execute_process(
++            COMMAND "${GIT_EXECUTABLE}" describe --always HEAD
++            WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
++            RESULT_VARIABLE res
++            OUTPUT_VARIABLE FREEDV_HASH
++            ERROR_QUIET
++            OUTPUT_STRIP_TRAILING_WHITESPACE)
++        message(STATUS "LPCNet current git hash: ${FREEDV_HASH}")
++        add_definitions(-DGIT_HASH="${FREEDV_HASH}")
++    else()
++        message(WARNING "Git not found. Can not determine current commit hash.")
++        add_definitions(-DGIT_HASH="Unknown")
++    endif()
++else()
++    add_definitions(-DGIT_HASH="None")
++endif()
++
+ # Set default flags
+ set(CMAKE_C_FLAGS "-Wall -W -Wextra -Wno-unused-function -O3 -g -I. -MD ${CMAKE_C_FLAGS} -DENABLE_ASSERTIONS")
+ 
++# Arch specific stuff here
++message(STATUS "Host system arch is: ${CMAKE_SYSTEM_PROCESSOR}")
++
+ # Detection of available CPU optimizations
+ if(NOT DISABLE_CPU_OPTIMIZATION)
+     if(UNIX AND NOT APPLE)
+@@ -60,15 +92,25 @@ if(NOT DISABLE_CPU_OPTIMIZATION)
+             OUTPUT_VARIABLE AVX2)
+         execute_process(COMMAND grep -c "avx " /proc/cpuinfo
+             OUTPUT_VARIABLE AVX)
++        execute_process(COMMAND grep -c "sse4_1 " /proc/cpuinfo
++            OUTPUT_VARIABLE SSE)
+         execute_process(COMMAND grep -c "neon" /proc/cpuinfo
+             OUTPUT_VARIABLE NEON)
+     elseif(APPLE)
+-        # Under OSX we need to look through a few sysctl entries to determine what our CPU supports.
+-        message(STATUS "Looking for available CPU optimizations on an OSX system...")
+-        execute_process(COMMAND sysctl -a COMMAND grep machdep.cpu.leaf7_features COMMAND grep -c AVX2
+-            OUTPUT_VARIABLE AVX2)
+-        execute_process(COMMAND sysctl -a COMMAND grep machdep.cpu.features COMMAND grep -c AVX
+-            OUTPUT_VARIABLE AVX)
++        if(BUILD_OSX_UNIVERSAL)
++            # Presume AVX/AVX2 are enabled on the x86 side. The ARM side will auto-enable
++            # NEON optimizations by virtue of being aarch64.
++            set(AVX TRUE)
++            set(AVX2 TRUE)
++            set(SSE TRUE)
++        else()
++            # Under OSX we need to look through a few sysctl entries to determine what our CPU supports.
++            message(STATUS "Looking for available CPU optimizations on an OSX system...")
++            execute_process(COMMAND sysctl -a COMMAND grep machdep.cpu.leaf7_features COMMAND grep -c AVX2
++                OUTPUT_VARIABLE AVX2)
++            execute_process(COMMAND sysctl -a COMMAND grep machdep.cpu.features COMMAND grep -c AVX
++                OUTPUT_VARIABLE AVX)
++        endif(BUILD_OSX_UNIVERSAL)
+     elseif(WIN32)
+         message(STATUS "No detection capability on Windows, assuming AVX is available.")
+         set(AVX TRUE)
+@@ -85,9 +127,13 @@ elseif(${AVX} OR ${AVX} GREATER 0)
+ # AVX2 machines will also match on AVX
+     message(STATUS "avx processor flags found or enabled.")
+     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
++elseif(${SSE} OR ${SSE} GREATER 0)
++# AVX and AVX2 machines will also match on SSE
++    message(STATUS "sse processor flags found or enabled.")
++    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
+ endif()
+ 
+-# RPi
++# RPi / ARM 32bit
+ if(${NEON} OR ${NEON} GREATER 0)
+     message(STATUS "neon processor flags found or enabled.")
+     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=armv8-a -mtune=cortex-a53")
+diff --git a/README.md b/README.md
+index c446450..5b72d8c 100644
+--- a/README.md
++++ b/README.md
+@@ -25,14 +25,22 @@ LPCNet at 1733 bits/s using direct-split quantiser:
+ ```
+ sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s | aplay -f S16_LE -r 16000
+ ```
+-# CTests
++
++## Manually Selecting SIMD Technology
++
++CMake will select the fastest SIMD available (AVX/SSE/None), however you can manually select one, e.g.:
++```
++cmake -DDISABLE_CPU_OPTIMIZATION=ON -DSSE=ON -DCODEC2_BUILD_DIR=~/codec2/build_linux ..
++```
++
++## CTests
+ 
+ ```
+ $ cd ~/LPCNet/build_linux
+ $ ctest
+ ```
+ 
+-Note, due to precision/library issues several tests (1-3) will only pass on certain machines such as Ubuntu 16 and 18, Ubuntu 17 is known to fail.
++Note, due to precision/library issues several tests (1-3) will [only pass on some machines](https://github.com/drowe67/LPCNet/issues/17).
+ 
+ # Reading Further
+ 
+diff --git a/src/700c_train.sh b/src/700c_train.sh
+new file mode 100755
+index 0000000..3be057e
+--- /dev/null
++++ b/src/700c_train.sh
+@@ -0,0 +1,73 @@
++#!/bin/bash -x
++# 700c_train.sh
++# David Rowe March 2020
++# Experiments in LPCNet decoding of Codec 2 700C
++
++PATH=$HOME/codec2/build_linux/src:$HOME/LPCNet/build_linux/src:$HOME/LPCNet/src:$PATH
++
++if [ "$#" -ne 1 ]; then
++    echo "usage: ./700c_train.sh datestamp"
++    echo "       ./700c_train.sh 200404"
++    exit 0
++fi
++
++train1=dev-clean-8k
++test1=test-clean-8k
++test2=all_speech_subset_8k
++test3=all_8k
++datestamp=$1
++epochs=30
++log=${1}.txt
++train=${datestamp}_train
++
++# synth "c2sim arg for experiment" "experiment label" "filename"
++synth() {
++    test=$3
++    c2sim ~/Downloads/${test}.sw --rateKWov ${test}.f32 ${1}
++    test_lpcnet --mag 2 --frame_size 80 --pre 0 ${test}.f32 ${datestamp}_${test}_${2}.sw
++}
++
++# experiment "c2sim arg for experiment" "experiment label"
++experiment() {
++    echo "------------------------------------------------------------------------------"
++    echo "train starting" ${2}
++    echo "------------------------------------------------------------------------------"
++    
++    c2sim ${train}.sw --ten_ms_centre ${train}_10ms.sw --rateKWov ${train}.f32 ${1}
++    sw2packedulaw --frame_size 80 ${train}_10ms.sw ${train}.f32 ${train}_10ms.pulaw
++
++    train_lpcnet.py ${train}.f32 ${train}_10ms.pulaw ${datestamp}_${2} --epochs ${epochs} --frame_size 80
++    
++    dump_lpcnet.py ${datestamp}_${2}_${epochs}.h5
++    cp nnet_data.c src
++    make test_lpcnet
++
++    synth "${1}" "${2}" "${test1}"
++    synth "${1}" "${2}" "${test2}"
++    synth "${1}" "${2}" "${test3}"
++}
++
++rm -f $log
++
++(
++    date
++
++    # assemble some training speech
++    sox -r 8000 -c 1 ~/Downloads/${train1}.sw \
++	-t sw -r 8000 -c 1 ${train}.sw    
++
++    # LPCNet with 10ms frames (similar to training data) 
++    experiment "" "none"
++    
++    # Codec 2 700C at 40ms frame rate (700 bits/s) from c2dec
++    c2enc 700C ~/Downloads/${test1}.sw - --eq --var | c2dec 700C - /dev/null --mlfeat ${test1}_dec4.f32
++    test_lpcnet --mag 2 --frame_size 80 --pre 0 ${test1}_dec4.f32 ${datestamp}_${test1}_40.sw
++    c2enc 700C ~/Downloads/${test2}.sw - --eq --var | c2dec 700C - /dev/null --mlfeat ${test2}_dec4.f32
++    test_lpcnet --mag 2 --frame_size 80 --pre 0 ${test2}_dec4.f32 ${datestamp}_${test2}_40.sw
++    c2enc 700C ~/Downloads/${test3}.sw - --eq --var | c2dec 700C - /dev/null --mlfeat ${test3}_dec4.f32
++    test_lpcnet --mag 2 --frame_size 80 --pre 0 ${test3}_dec4.f32 ${datestamp}_${test3}_40.sw
++    
++    date
++) |& tee $log
++
++
+diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
+index 41a78dc..0df4672 100644
+--- a/src/CMakeLists.txt
++++ b/src/CMakeLists.txt
+@@ -23,6 +23,8 @@ add_library(lpcnetfreedv SHARED ${lpcnet_freedv_srcs})
+ target_link_libraries(lpcnetfreedv codec2)
+ set_target_properties(lpcnetfreedv PROPERTIES
+     PUBLIC_HEADER lpcnet_freedv.h
++	VERSION ${LPCNET_VERSION}
++	SOVERSION ${LPCNET_SOVERSION}
+ )
+ target_include_directories(lpcnetfreedv INTERFACE
+     $<INSTALL_INTERFACE:include/lpcnet>
+@@ -49,11 +51,11 @@ target_link_libraries(dump_data lpcnetfreedv m codec2)
+ add_executable(test_lpcnet test_lpcnet.c)
+ target_link_libraries(test_lpcnet lpcnetfreedv m codec2)
+ 
+-if(AVX OR AVX2)
++if(SSE OR AVX OR AVX2 OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+     add_executable(test_vec test_vec.c)
+     target_link_libraries(test_vec m)
+ else()
+-    message(WARNING "No AVX/AVX2 CPU flags identified, not building test_vec.")
++    message(WARNING "No SSE/AVX/AVX2 CPU flags identified, not building test_vec.")
+ endif()
+ 
+ add_executable(quant_feat quant_feat.c)
+@@ -98,6 +100,12 @@ target_link_libraries(idct lpcnetfreedv m codec2)
+ add_executable(nnet2f32 nnet2f32.c)
+ target_link_libraries(nnet2f32 lpcnetfreedv m)
+ 
++add_executable(sw2packedulaw sw2packedulaw.c)
++target_link_libraries(sw2packedulaw lpcnetfreedv m)
++
++add_executable(thash thash.c)
++target_link_libraries(thash lpcnetfreedv m)
++
+ install(TARGETS lpcnet_enc lpcnet_dec
+     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+     )
+diff --git a/src/codec2_pitch.c b/src/codec2_pitch.c
+index a267785..55fb5bc 100644
+--- a/src/codec2_pitch.c
++++ b/src/codec2_pitch.c
+@@ -113,6 +113,7 @@ int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing)
+ 
+ void codec2_pitch_destroy(CODEC2_PITCH *pitch)
+ {
++    free(pitch->fft_fwd_cfg);
+     nlp_destroy(pitch->nlp_states);
+     free(pitch->w);
+     free(pitch);
+diff --git a/src/concat.sh b/src/concat.sh
+old mode 100644
+new mode 100755
+index 8369117..d98ccda
+--- a/src/concat.sh
++++ b/src/concat.sh
+@@ -1,6 +1,8 @@
+-# Place in 16k-LP7 from TSPSpeech.iso and run to concatenate wave files
+-# into one headerless training file
+-for i in */*.wav
++#!/bin/bash
++# Concatenate .wav files into one headerless .sw training file
++# usage: ./concat.sh concatfile.sw
++
++for i in `find . -name '*.wav'`
+ do
+ sox $i -r 16000 -c 1 -t sw -
+-done > input.s16
++done > $1
+diff --git a/src/dump_data.c b/src/dump_data.c
+index cd936cf..4e8d3c4 100644
+--- a/src/dump_data.c
++++ b/src/dump_data.c
+@@ -453,6 +453,7 @@ int main(int argc, char **argv) {
+ 	assert(pitch_index < 2*PITCH_MAX_PERIOD);
+ 	assert(pitch_index >= 2*PITCH_MIN_PERIOD);
+         features[2*NB_BANDS] = 0.01*(pitch_index-200);
++	//fprintf(stderr, "count: %d [36] %f pitch_index: %d\n", count, features[36], pitch_index);
+         if (c2voicing_en) features[2*NB_BANDS+1] = voicing;
+     }
+     fwrite(features, sizeof(float), NB_FEATURES, ffeat);
+diff --git a/ext_pitch.sh b/src/ext_pitch.sh
+similarity index 100%
+rename from ext_pitch.sh
+rename to src/ext_pitch.sh
+diff --git a/src/flac_to_wav.sh b/src/flac_to_wav.sh
+new file mode 100755
+index 0000000..8f8aa29
+--- /dev/null
++++ b/src/flac_to_wav.sh
+@@ -0,0 +1,10 @@
++#!/bin/bash
++# Convert all .flac files under this folder to .wav files
++# source: several GitHub repos
++
++find . -iname "*.flac" | wc
++
++find . -iname "*.flac" | while IFS= read -r flacfile
++do  # one path per line, so names containing spaces survive; -nostdin stops ffmpeg consuming the piped list
++    ffmpeg -nostdin -y -f flac -i "$flacfile" -ab 64k -ac 1 -ar 16000 -f wav "${flacfile%.*}.wav"
++done
+diff --git a/src/freq.c b/src/freq.c
+index c88d071..dbe94d9 100644
+--- a/src/freq.c
++++ b/src/freq.c
+@@ -140,6 +140,13 @@ static void check_init() {
+   common.init = 1;
+ }
+ 
++void freq_close() {
++    if (common.init) {
++        opus_fft_free(common.kfft,0);
++        common.init = 0;
++    }
++}
++
+ void dct(float *out, const float *in) {
+   int i;
+   check_init();
+diff --git a/src/freq.h b/src/freq.h
+index 0316edd..314eabd 100644
+--- a/src/freq.h
++++ b/src/freq.h
+@@ -42,6 +42,7 @@
+ 
+ #define NB_BANDS 18
+ 
++void freq_close(void);
+ void compute_band_energy(float *bandE, const kiss_fft_cpx *X);
+ void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P);
+ 
+diff --git a/src/lpcnet.c b/src/lpcnet.c
+index e117f1c..9f3f059 100644
+--- a/src/lpcnet.c
++++ b/src/lpcnet.c
+@@ -54,8 +54,10 @@ struct LPCNetState {
+     float old_lpc[FEATURES_DELAY][LPC_ORDER];
+     float old_gain[FEATURES_DELAY];
+     int frame_count;
++    float preemph;
+     float deemph_mem;
+-    FILE *ftest;                    /* used to dump states for automates tests */
++    int   pitch_embedding;
++    FILE *ftest;                    /* used to dump states for automated tests */
+ };
+ 
+ 
+@@ -118,6 +120,8 @@ LPCNetState *lpcnet_create()
+     lpcnet = (LPCNetState *)calloc(sizeof(LPCNetState), 1);
+     lpcnet->last_exc = 128;
+     lpcnet->ftest = NULL;
++    lpcnet->preemph = PREEMPH;
++    lpcnet->pitch_embedding = 1;
+     return lpcnet;
+ }
+ 
+@@ -135,7 +139,15 @@ void lpcnet_open_test_file(LPCNetState *lpcnet, char file_name[]) {
+     }
+ }
+ 
+-void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features, int N, int logmag)
++void lpcnet_set_preemph(LPCNetState *lpcnet, float preemph) {
++    lpcnet->preemph = preemph;
++}
++
++void lpcnet_set_pitch_embedding(LPCNetState *lpcnet, int val) {
++    lpcnet->pitch_embedding = val;
++}
++
++void lpcnet_synthesize(LPCNetState *lpcnet, short *output, float *features, int N, int mag)
+ {
+     static int count = 0;
+     int i;
+@@ -149,13 +161,19 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
+     static int start = 0; /*(LPC_ORDER+1*/;
+     /* FIXME: Do proper rounding once the Python code rounds properly. */
+ 
+-    pitch = (int)floor(.1 + 50*features[36]+100);    
+-    assert(pitch >=0); assert(pitch <= 255);    
+-    /* latest networks (using the codec 2 pitch estimator) are trained
+-       with pitch estimates between 40 and 255, but due to the pitch
+-       quantiser design and bit errors it's possible to get pitch
+-       values down to 32, which upsets the pitch embed matrix */
+-    if (pitch < 40) pitch = 40;
++    if (lpcnet->pitch_embedding) {
++	pitch = (int)floor(.1 + 50*features[36]+100);
++	//fprintf(stderr, "count: %d [36] %f pitch: %d\n", lpcnet->frame_count, features[36], pitch);
++	assert(pitch >=0); assert(pitch <= 255);    
++	/* latest networks (using the codec 2 pitch estimator) are trained
++	   with pitch estimates between 40 and 255, but due to the pitch
++	   quantiser design and bit errors it's possible to get pitch
++	   values down to 32, which upsets the pitch embed matrix */
++	if (pitch < 40) pitch = 40;
++    }
++    else {
++	pitch = 0;
++    }
+     
+     pitch_gain = lpcnet->old_gain[FEATURES_DELAY-1];
+     memmove(&lpcnet->old_gain[1], &lpcnet->old_gain[0], (FEATURES_DELAY-1)*sizeof(lpcnet->old_gain[0]));
+@@ -164,13 +182,30 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
+     memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
+     memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
+ 
+-    if (logmag) {
+-        float tmp[NB_BANDS];
++    switch (mag) {
++    case 0:
++	lpc_from_cepstrum(lpcnet->old_lpc[0], features);
++	break;
++    case 1:
++    {
++	float tmp[NB_BANDS];
+         for (i=0;i<NB_BANDS;i++) tmp[i] = pow(10.f, features[i]);
+         lpc_from_bands(lpcnet->old_lpc[0], tmp);
+     }
+-    else
+-	lpc_from_cepstrum(lpcnet->old_lpc[0], features);
++	break;
++    case 2:
++        for (i=0;i<LPC_ORDER;i++) {
++	    lpcnet->old_lpc[0][i] = features[i+NB_BANDS];
++	}
++	break;
++    default:
++	assert(0);
++    }
++
++    /* We optionally use this part of feature vector to pass in LPCs,
++     * but we don't want any non zero values here hitting the
++     * frame rate network.  TODO: better design */
++    RNN_CLEAR(&features[18], 18); 
+ 
+     if (lpcnet->ftest) {
+         float pitch_f = pitch;
+@@ -220,7 +255,7 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
+         RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1);
+         lpcnet->last_sig[0] = pcm;
+         lpcnet->last_exc = exc;
+-        pcm += PREEMPH*lpcnet->deemph_mem;
++        pcm += lpcnet->preemph*lpcnet->deemph_mem;
+         lpcnet->deemph_mem = pcm;
+         if (pcm<-32767) pcm = -32767;
+         if (pcm>32767) pcm = 32767;
+diff --git a/src/lpcnet.h b/src/lpcnet.h
+index 70e849e..bd98a37 100644
+--- a/src/lpcnet.h
++++ b/src/lpcnet.h
+@@ -34,8 +34,10 @@
+ typedef struct LPCNetState LPCNetState;
+ LPCNetState *lpcnet_create();
+ void lpcnet_destroy(LPCNetState *lpcnet);
+-void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features, int N, int logmag);
++void lpcnet_synthesize(LPCNetState *lpcnet, short *output, float *features, int N, int mag); /* mag: 0 = cepstrum, 1 = log10 band magnitudes, 2 = LPCs read from features[NB_BANDS..]; features[18..35] are zeroed in place */
+ 
+ void lpcnet_open_test_file(LPCNetState *lpcnet, char file_name[]);
++void lpcnet_set_preemph(LPCNetState *lpcnet, float preemph);
++void lpcnet_set_pitch_embedding(LPCNetState *lpcnet, int val);
+ 
+ #endif
+diff --git a/src/lpcnet.py b/src/lpcnet.py
+index 010f478..960e8c8 100644
+--- a/src/lpcnet.py
++++ b/src/lpcnet.py
+@@ -36,7 +36,6 @@ import numpy as np
+ import h5py
+ import sys
+ 
+-frame_size = 160
+ pcm_bits = 8
+ embed_size = 128
+ pcm_levels = 2**pcm_bits
+@@ -113,7 +112,7 @@ class PCMInit(Initializer):
+             'seed': self.seed
+         }
+ 
+-def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, use_gpu=True):
++def new_lpcnet_model(frame_size = 160, rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, use_gpu=True):
+     pcm = Input(shape=(None, 3))
+     feat = Input(shape=(None, nb_used_features))
+     pitch = Input(shape=(None, 1))
+diff --git a/src/lpcnet_dump.c b/src/lpcnet_dump.c
+index d8a8409..58f9c98 100644
+--- a/src/lpcnet_dump.c
++++ b/src/lpcnet_dump.c
+@@ -87,7 +87,8 @@ static DenoiseState *rnnoise_create() {
+ }
+ 
+ static void rnnoise_destroy(DenoiseState *st) {
+-  free(st);
++    freq_close();
++    free(st);
+ }
+ 
+ static short float2short(float x)
+diff --git a/src/lpcnet_freedv.c b/src/lpcnet_freedv.c
+index 823fcdc..fe154ea 100644
+--- a/src/lpcnet_freedv.c
++++ b/src/lpcnet_freedv.c
+@@ -80,3 +80,9 @@ void lpcnet_dec(LPCNetFreeDV *lf, char *frame, short* pcm)
+ 
+ int lpcnet_samples_per_frame(LPCNetFreeDV *lf) { return FRAME_SIZE*lf->q->dec; } 
+ int lpcnet_bits_per_frame(LPCNetFreeDV *lf) { return lf->q->bits_per_frame; } 
++
++static char git_hash[] = GIT_HASH;
++char *lpcnet_get_hash(void) {
++    return git_hash;
++}
++
+diff --git a/src/lpcnet_freedv.h b/src/lpcnet_freedv.h
+index 43c8298..874f7cc 100644
+--- a/src/lpcnet_freedv.h
++++ b/src/lpcnet_freedv.h
+@@ -8,6 +8,10 @@
+ #ifndef __LPCNET_FREEDV__
+ #define __LPCNET_FREEDV__
+ 
++#ifdef __cplusplus
++  extern "C" {
++#endif
++
+ typedef struct LPCNetFreeDV LPCNetFreeDV;
+ 
+ LPCNetFreeDV* lpcnet_freedv_create(int direct_split);
+@@ -16,5 +20,10 @@ void lpcnet_enc(LPCNetFreeDV *lf, short *pcm, char *frame);
+ void lpcnet_dec(LPCNetFreeDV *lf, char *frame, short* pcm);
+ int lpcnet_bits_per_frame(LPCNetFreeDV *lf);
+ int lpcnet_samples_per_frame(LPCNetFreeDV *lf);
++char *lpcnet_get_hash(void);
++
++#ifdef __cplusplus
++}
++#endif
+ 
+ #endif
+diff --git a/src/nnet.c b/src/nnet.c
+index 8ad4a26..1da7d70 100644
+--- a/src/nnet.c
++++ b/src/nnet.c
+@@ -43,7 +43,9 @@
+ 
+ #ifdef __AVX__
+ #include "vec_avx.h"
+-#elif __ARM_NEON__
++#elif __SSE__
++#include "vec_sse.h"
++#elif __ARM_NEON__ || __aarch64__
+ #include "vec_neon.h"
+ #else
+ #warning Compiling without any vectorization. This code will be very slow
+diff --git a/src/plot_lpc.m b/src/plot_lpc.m
+new file mode 100644
+index 0000000..3b814be
+--- /dev/null
++++ b/src/plot_lpc.m
+@@ -0,0 +1,50 @@
++% plot_lpc.m
++% David Rowe April 2020
++%
++% Visualise LPC spectra for 700C decoder experiments
++
++Fs  = 8000;       % speech sample rate
++Fsf = 100;        % frame sample rate
++nb_features = 55;
++nb_rateK = 18;    % number of rateK (log amplitude) features
++nb_lpc = 10;      % number of LPCs
++
++function plot_against_time(v, st_sec, en_sec, Fs, leg='b')
++  st = Fs*st_sec; en = Fs*en_sec;
++  t = st_sec:1/Fs:en_sec;
++  plot(t,v(st+1:en+1),leg);
++endfunction
++
++function mesh_against_time(m, st_sec, en_sec, Fs)
++  st = Fs*st_sec; en = Fs*en_sec;
++  t = st_sec:1/Fs:en_sec;
++  mesh(m(st+1:en+1,:));  
++endfunction
++
++function mesh_aks_against_time(aks, st_sec, en_sec, Fs)
++  st = Fs*st_sec; en = Fs*en_sec;
++  t = st_sec:1/Fs:en_sec;
++  aks = aks(st+1:en+1,:); A = [];
++  for f=1:length(aks)
++    A = [A freqz(1,[1 aks(f,:)],64)];
++  end
++  AdB = 20*log10(abs(A));
++  max(AdB(:))
++  mesh(AdB);  
++endfunction
++
++# plots of speech (input), rateK vectors, LPC spectra
++
++features=load_f32("../build_linux/all_8k.f32", nb_features);
++rateK=features(:, 1:nb_rateK);
++aks = features(:, nb_rateK+1:nb_rateK+nb_lpc);
++fs=fopen("../build_linux/all_8k_10ms.sw","rb");
++s = fread(fs,Inf,"short");
++fclose(fs);
++
++st_sec=14; en_sec=16;
++
++figure(1); clf; plot_against_time(s, st_sec, en_sec, Fs, 'b')
++figure(2); clf; mesh_against_time(rateK, st_sec, en_sec, Fsf);
++figure(3); clf; mesh_aks_against_time(aks, st_sec, en_sec, Fsf);
++
+diff --git a/src/plot_pulaw.py b/src/plot_pulaw.py
+new file mode 100755
+index 0000000..10d5656
+--- /dev/null
++++ b/src/plot_pulaw.py
+@@ -0,0 +1,52 @@
++#!/usr/bin/python3
++# Utility to inspect packed ulaw samples from sw2packedulaw.c (or dump_data.c) before training 
++
++import numpy as np
++import matplotlib.pyplot as plt
++import sys
++import ulaw
++import argparse
++
++parser = argparse.ArgumentParser(description='Plot LPCNet training packed ulaw samples')
++parser.add_argument('file1', help='pulaw file of packed ulaw samples')
++parser.add_argument('--file2', help='optional second packed ulaw file to compare')
++parser.add_argument('--nb_samples', type=int, default=-1, help='Optional number of samples to plot')
++args = parser.parse_args()
++
++data = np.fromfile(args.file1, dtype='uint8')
++nb_samples = args.nb_samples if args.nb_samples >= 0 else None  # default -1 means "everything"; None avoids data[:-1] dropping the last byte
++data = data[:nb_samples]
++
++sig = np.array(data[0::4], dtype='float')
++pred = np.array(data[1::4], dtype='float')
++in_exc = np.array(data[2::4], dtype='float')
++out_exc = np.array(data[3::4], dtype='float')
++   
++print("exc var: %4.3e" % (np.var(ulaw.ulaw2lin(in_exc))))
++
++plt.figure(1)
++plt.subplot(211)
++plt.plot(ulaw.ulaw2lin(sig), label='sig')
++plt.ylim((-30000,30000))
++plt.legend()
++plt.subplot(212)
++plt.plot(ulaw.ulaw2lin(pred), label='pred')
++plt.ylim((-30000,30000))
++plt.legend()
++plt.show(block=False)
++
++plt.figure(2)
++plt.subplot(211)
++plt.plot(ulaw.ulaw2lin(in_exc), label='in_exc')
++if args.file2:
++    data2 = np.fromfile(args.file2, dtype='uint8')
++    data2 = data2[:nb_samples]
++    in_exc2 = np.array(data2[2::4], dtype='float')
++    plt.plot(ulaw.ulaw2lin(in_exc2), label='in_exc2')
++plt.ylim((-30000,30000))
++plt.legend()
++plt.subplot(212)
++plt.plot(ulaw.ulaw2lin(out_exc), label='out_exc')
++plt.ylim((-30000,30000))
++plt.legend()
++plt.show()
+diff --git a/src/plot_train.py b/src/plot_train.py
+index 910d7e9..7e2bc7b 100644
+--- a/src/plot_train.py
++++ b/src/plot_train.py
+@@ -3,11 +3,10 @@ import numpy as np
+ import sys
+ 
+ loss = np.loadtxt(sys.argv[1])
+-delta_loss = (loss[1:,0]-loss[:-1,0])/loss[1:,0]
++delta_loss = (loss[1:]-loss[:-1])/loss[1:]
+ 
+ plt.figure(1)
+-plt.plot(loss[:,0],'r')
+-plt.plot(loss[:,1],'g')
++plt.plot(loss[:],'r')
+ plt.title('loss')
+ plt.show(block=False)
+ plt.figure(2)
+diff --git a/src/plot_train.sh b/src/plot_train.sh
+index 2a1fddf..3c86094 100755
+--- a/src/plot_train.sh
++++ b/src/plot_train.sh
+@@ -6,5 +6,5 @@
+ # plot graphs of loss and spares categorical accuracy to get a feel
+ # for progress while training
+ 
+-grep loss $1 | sed -n 's/.*===\].*loss: \(.*\) - val_loss: \(.*\)/\1 \2/p' > loss.txt
+-python3 plot_train.py loss.txt
++grep loss $1 | sed -n 's/.*===\].*step - loss: \(.*\)/\1/p' > loss.txt
++python3 "$(dirname "$0")/plot_train.py" loss.txt  # use the plotter that sits next to this script instead of a fixed ~/LPCNet checkout
+diff --git a/process.sh b/src/process.sh
+similarity index 100%
+rename from process.sh
+rename to src/process.sh
+diff --git a/src/sw2packedulaw.c b/src/sw2packedulaw.c
+new file mode 100644
+index 0000000..7724158
+--- /dev/null
++++ b/src/sw2packedulaw.c
+@@ -0,0 +1,188 @@
++/*
++  sw2packedulaw.c
++
++  Convert signed word samples to packed ulaw samples to drive LPCNet
++  training, this code is a cut/paste from dump_data.c with a few other
++  options.
++
++  By varying the LPC predictor coefficients we can try no predictor,
++  first order, and regular LPC.
++
++  1. No prediction (WaveRNN I guess):
++    $ ~/codec2/build_linux/src/c2sim ~/Downloads/all_8k.sw --ten_ms_centre all_8k_10ms.sw --rateKWov all_8k.f32 
++    $ ./src/sw2packedulaw --frame_size 80 all_8k_10ms.sw all_8k.f32 all_8k_none.pulaw
++    $ ../src/plot_pulaw.py all_8k_none.pulaw
++
++  2. First order predictor:
++    $ ~/codec2/build_linux/src/c2sim ~/Downloads/all_8k.sw --ten_ms_centre all_8k_10ms.sw --rateKWov all_8k.f32 --first
++    $ ./src/sw2packedulaw --frame_size 80 all_8k_10ms.sw all_8k.f32 all_8k_first.pulaw
++
++  3. LPC with ulaw Q in the loop and noise injection (standard LPCNet design):
++    $ ~/codec2/build_linux/src/c2sim ~/Downloads/all_8k.sw --ten_ms_centre all_8k_10ms.sw --rateKWov all_8k.f32 --lpc 10
++    $ ./src/sw2packedulaw --frame_size 80 all_8k_10ms.sw all_8k.f32 all_8k.pulaw
++
++  4. LPC with no Q in the loop or noise injection (linear):
++    $ ./src/sw2packedulaw --frame_size 80 --linear all_8k_10ms.sw all_8k.f32 all_8k_linear.pulaw
++
++  See plot_pulaw.py to inspect output .pulaw files
++*/
++
++#include <stdlib.h>
++#include <stdio.h>
++#include "common.h"
++#include <math.h>
++#include "freq.h"
++#include "pitch.h"
++#include "arch.h"
++#include "celt_lpc.h"
++#include <assert.h>
++#include <getopt.h>
++
++#define NB_FEATURES 55
++#define CODEC2_LPC_ORDER 10
++
++typedef struct {
++  float lpc[LPC_ORDER];
++  float sig_mem[LPC_ORDER];
++  int exc_mem;
++} DenoiseState;
++
++void write_audio(DenoiseState *st, const short *pcm, float noise_std, FILE *file, int frame_size) {
++  int i;
++  unsigned char data[4*frame_size];
++  for (i=0;i<frame_size;i++) {
++    int noise;
++    float p=0;
++    float e;
++    int j;
++    for (j=0;j<LPC_ORDER;j++) p -= st->lpc[j]*st->sig_mem[j];
++    e = lin2ulaw(pcm[i] - p);
++    /* Signal. */
++    data[4*i] = lin2ulaw(st->sig_mem[0]);
++    /* Prediction. */
++    data[4*i+1] = lin2ulaw(p);
++    /* Excitation in. */
++    data[4*i+2] = st->exc_mem;
++    /* Excitation out. */
++    data[4*i+3] = e;
++    /* Simulate error on excitation. */
++    noise = (int)floor(.5 + noise_std*.707*(log_approx((float)rand()/RAND_MAX)-log_approx((float)rand()/RAND_MAX)));
++    e += noise;
++    e = IMIN(255, IMAX(0, e));
++    
++    RNN_MOVE(&st->sig_mem[1], &st->sig_mem[0], LPC_ORDER-1);
++    st->sig_mem[0] = p + ulaw2lin(e);
++    st->exc_mem = e;
++  }
++  fwrite(data, 4*frame_size, 1, file);
++}
++
++/* takes ulaw out of predictor path, and no noise injection */
++void write_audio_linear(DenoiseState *st, const short *pcm, FILE *file, int frame_size) {
++  int i;
++  unsigned char data[4*frame_size];
++  for (i=0;i<frame_size;i++) {
++    float p=0;
++    float e;
++    int j;
++    for (j=0;j<LPC_ORDER;j++) p -= st->lpc[j]*st->sig_mem[j];
++    e = pcm[i] - p;
++    //fprintf(stderr,"pcm: %d p: %f e: %f\n", pcm[i], p, e);
++    /* Signal. */
++    data[4*i] = lin2ulaw(st->sig_mem[0]);
++    /* Prediction. */
++    data[4*i+1] = lin2ulaw(p);
++    /* Excitation in. */
++    data[4*i+2] = st->exc_mem;
++    /* Excitation out. */
++    data[4*i+3] = lin2ulaw(e);
++
++    RNN_MOVE(&st->sig_mem[1], &st->sig_mem[0], LPC_ORDER-1);
++    st->sig_mem[0] = pcm[i];
++    st->exc_mem = lin2ulaw(e);
++  }
++  fwrite(data, 4*frame_size, 1, file);
++}
++
++int main(int argc, char *argv[]) {
++    int linear = 0;
++    int frame_size = FRAME_SIZE;
++    
++    DenoiseState st;
++    memset(&st, 0, sizeof(DenoiseState));
++    st.exc_mem = 128;
++    
++    int o = 0;
++    int opt_idx = 0;
++    while( o != -1 ) {
++        static struct option long_opts[] = {
++            {"linear", no_argument, 0, 'l'},
++            {"frame_size", required_argument, 0, 'f'},
++            {0, 0, 0, 0}
++        };
++        
++	o = getopt_long(argc,argv,"l",long_opts,&opt_idx);
++        
++	switch(o){
++	case 'f':
++	    frame_size = atoi(optarg);
++	    fprintf(stderr, "frame_size: %d\n", frame_size);
++	    break;
++	case 'l':
++	    linear = 1;
++	    break;
++	case '?':
++	    goto helpmsg;
++	    break;
++	}
++    }
++    int dx = optind;
++
++    if ((argc - dx) < 3) {
++    helpmsg:
++        fprintf(stderr, "usage: sw2packedulaw [--linear] [--frame_size N] Input.s16 FeatureFile.f32 Output.pulaw\n");
++        return 0;
++    }
++
++    FILE *fsw = fopen(argv[dx], "rb");
++    if (fsw == NULL) {
++	fprintf(stderr, "Can't open %s\n", argv[dx]);
++	exit(1);
++    }
++    
++    FILE *ffeature = fopen(argv[dx+1], "rb");
++    if (ffeature == NULL) {
++	fprintf(stderr, "Can't open %s\n", argv[dx+1]);
++	exit(1);
++    }
++    
++    FILE *fpackedpcm = fopen(argv[dx+2], "wb");
++    if (fpackedpcm == NULL) {
++	fprintf(stderr, "Can't open %s\n", argv[dx+2]);
++	exit(1);
++    }
++    
++    short frame[frame_size];
++    while (fread(frame, sizeof(short), frame_size, fsw) == (unsigned)frame_size) {
++	float features[NB_FEATURES];
++	int ret = fread(features, sizeof(float), NB_FEATURES, ffeature);
++	if (ret != NB_FEATURES) {
++	    fprintf(stderr, "feature file ended early!\n");
++	    exit(1);		
++	}
++	for(int i=0; i<CODEC2_LPC_ORDER; i++) {
++	    st.lpc[i] = features[18+i];
++	}
++	if (linear)
++	    write_audio_linear(&st, frame, fpackedpcm, frame_size);
++	else {
++	    write_audio(&st, frame, 0.5, fpackedpcm, frame_size);
++	}
++    }
++
++    fclose(fsw);
++    fclose(ffeature);
++    fclose(fpackedpcm);
++    return 0;
++}
++
+diff --git a/src/test_lpcnet.c b/src/test_lpcnet.c
+index 0a34729..e8c9907 100644
+--- a/src/test_lpcnet.c
++++ b/src/test_lpcnet.c
+@@ -36,26 +36,37 @@
+ int main(int argc, char **argv) {
+     FILE *fin, *fout;
+     LPCNetState *net;
+-    int logmag = 0;
+-
++    int mag = 0;
++    int frame_size = FRAME_SIZE;
++    
+     net = lpcnet_create();
+     
+     int o = 0;
+     int opt_idx = 0;
+     while( o != -1 ) {
+         static struct option long_opts[] = {
+-            {"mag", no_argument, 0, 'i'},
+-            {"nnet", required_argument, 0, 'n'},
++            {"frame_size", required_argument, 0, 'f'},
+             {"logstates", required_argument, 0, 'l'},
+-            {0, 0, 0, 0}
++            {"mag", required_argument, 0, 'i'},
++            {"nnet", required_argument, 0, 'n'},
++            {"no_pitch_embedding", no_argument, 0, 'e'},
++            {"pre", required_argument, 0, 'p'},
++           {0, 0, 0, 0}
+         };
+         
+ 	o = getopt_long(argc,argv,"ihn:l:",long_opts,&opt_idx);
+         
+ 	switch(o){
++	case 'e':
++	    lpcnet_set_pitch_embedding(net, 0);
++	    break;
++	case 'f':
++	    if ((frame_size = atoi(optarg)) <= 0) goto helpmsg; /* sizes pcm[]: must be > 0 */
++	    fprintf(stderr, "frame_size: %d\n", frame_size);
++	    break;
+ 	case 'i':
+-	    logmag = 1;
+-	    fprintf(stderr, "logmag: %d\n", logmag);
++	    mag = optarg ? atoi(optarg) : 1; /* short -i takes no argument (optarg NULL): keep old logmag=1 */
++	    fprintf(stderr, "mag: %d\n", mag);
+ 	    break;
+ 	case 'l':
+ 	    fprintf(stderr, "logstates file: %s\n", optarg);
+@@ -65,6 +76,10 @@ int main(int argc, char **argv) {
+ 	    fprintf(stderr, "loading nnet: %s\n", optarg);
+ 	    nnet_read(optarg);
+ 	    break;
++	case 'p':
++	    if (atoi(optarg) == 0)
++		lpcnet_set_preemph(net, 0.0);
++	    break;
+ 	case '?':
+ 	    goto helpmsg;
+ 	    break;
+@@ -74,7 +89,9 @@ int main(int argc, char **argv) {
+ 
+     if ((argc - dx) < 2) {
+     helpmsg:
+-        fprintf(stderr, "usage: test_lpcnet [--mag] [--logstates statesfile] [--nnet lpcnet_xxx.f32] <features.f32> <output.pcm>\n");
++        fprintf(stderr, "usage: test_lpcnet [--mag 1|2] [--logstates statesfile] [--nnet lpcnet_xxx.f32]"
++		" [--frame_size samples] [--no_pitch_embedding] [--pre 0|1] <features.f32> <output.s16>\n");
++	fprintf(stderr, "--mag -i 0-cepstrals, 1-logmag, 2-disable LPC (WaveRNN)\n");
+         return 0;
+     }
+ 
+@@ -99,13 +116,12 @@ int main(int argc, char **argv) {
+     while (1) {
+         float in_features[NB_TOTAL_FEATURES];
+         float features[NB_FEATURES];
+-        short pcm[FRAME_SIZE];
++        short pcm[frame_size];
+         int nread = fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin);
+         if (nread != NB_TOTAL_FEATURES) break;
+         RNN_COPY(features, in_features, NB_FEATURES);
+-        RNN_CLEAR(&features[18], 18);
+-        lpcnet_synthesize(net, pcm, features, FRAME_SIZE, logmag);
+-        fwrite(pcm, sizeof(pcm[0]), FRAME_SIZE, fout);
++        lpcnet_synthesize(net, pcm, features, frame_size, mag);
++        fwrite(pcm, sizeof(pcm[0]), frame_size, fout);
+         if (fout == stdout) fflush(stdout);
+     }
+     fclose(fin);
+diff --git a/src/test_vec.c b/src/test_vec.c
+index 09b51e7..efa617e 100644
+--- a/src/test_vec.c
++++ b/src/test_vec.c
+@@ -26,7 +26,10 @@ const char simd[]="AVX2";
+ #else
+ const char simd[]="AVX";
+ #endif
+-#elif __ARM_NEON__
++#elif __SSE__
++#include "vec_sse.h"
++const char simd[]="SSE";
++#elif __ARM_NEON__ || __aarch64__
+ #include "vec_neon.h"
+ const char simd[]="NEON";
+ #else
+diff --git a/src/thash.c b/src/thash.c
+new file mode 100644
+index 0000000..5b60f2e
+--- /dev/null
++++ b/src/thash.c
+@@ -0,0 +1,19 @@
++/*---------------------------------------------------------------------------*\
++
++  FILE........: thash.c
++  AUTHOR......: David Rowe
++  DATE CREATED: July 2020
++
++  Simple test program for LPCNet API get hash function
++
++\*---------------------------------------------------------------------------*/
++
++#include <stdio.h>
++#include "lpcnet_freedv.h"
++
++int main(void) { 
++    printf("%s\n", lpcnet_get_hash());
++    return 0;
++}
++
++
+diff --git a/train_direct.sh b/src/train_direct.sh
+similarity index 100%
+rename from train_direct.sh
+rename to src/train_direct.sh
+diff --git a/src/train_lpcnet.py b/src/train_lpcnet.py
+index 62abbd7..94ab9a8 100755
+--- a/src/train_lpcnet.py
++++ b/src/train_lpcnet.py
+@@ -35,9 +35,14 @@ from keras.callbacks import ModelCheckpoint
+ from ulaw import ulaw2lin, lin2ulaw
+ import keras.backend as K
+ import h5py
+-
++import argparse
++import os
+ import tensorflow as tf
+ from keras.backend.tensorflow_backend import set_session
++import matplotlib.pyplot as plt
++
++# less verbose tensorflow ....
++os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+ config = tf.ConfigProto()
+ 
+ # use this option to reserve GPU memory, e.g. for running more than
+@@ -46,23 +51,38 @@ config = tf.ConfigProto()
+ 
+ set_session(tf.Session(config=config))
+ 
+-nb_epochs = 10
+-
+ # Try reducing batch_size if you run out of memory on your GPU
+ batch_size = 32
++# width of feature records used for training
++nb_features = 55
++
++parser = argparse.ArgumentParser(description='LPCNet training')  # three positional files plus optional overrides
++parser.add_argument('feature_file', help='.f32 file of float features')
++parser.add_argument('packed_ulaw_file', help='file of 4 multiplexed ulaw samples per speech sample')
++parser.add_argument('prefix', help='.h5 file prefix to easily identify each experiment')
++parser.add_argument('--frame_size', type=int, default=160, help='frames size in samples')
++parser.add_argument('--epochs', type=int, default=20, help='Number of training epochs')
++parser.add_argument('--no_pitch_embedding', action='store_true', help='disable pitch embedding')
++parser.add_argument('--load_h5', help='.h5 weights file to load into the model before training')
++args = parser.parse_args()
+ 
+-model, _, _ = lpcnet.new_lpcnet_model(training=True)
++nb_epochs = args.epochs
++
++model, _, _ = lpcnet.new_lpcnet_model(frame_size=args.frame_size, training=True)
+ 
+ model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+ model.summary()
+ 
+-feature_file = sys.argv[1]
+-pcm_file = sys.argv[2]            # 16 bit unsigned short PCM samples
+-prefix = sys.argv[3]              # prefix to put on .h5 files to easily name each experiment
++if args.load_h5:
++    print("loading: %s" % (args.load_h5))
++    model.load_weights(args.load_h5)
++
++feature_file = args.feature_file
++pcm_file = args.packed_ulaw_file           
++prefix = args.prefix              
+ frame_size = model.frame_size
+-nb_features = 55
+ nb_used_features = model.nb_used_features
+-feature_chunk_size = 15
++feature_chunk_size = 15 # time window for conv1d/receptive field
+ pcm_chunk_size = frame_size*feature_chunk_size
+ 
+ # u for unquantised, load 16 bit PCM samples and convert to mu-law
+@@ -84,7 +104,17 @@ in_exc = np.reshape(data[2::4], (nb_frames, pcm_chunk_size, 1))
+ out_exc = np.reshape(data[3::4], (nb_frames, pcm_chunk_size, 1))
+ del data
+ 
+-print("ulaw std = ", np.std(out_exc))
++"""
++# plot ulaw signals to sanity check
++testf=10
++print(sig.shape)
++#plt.plot(sig[testf,:],label="sig")
++#plt.plot(pred[testf,:],label="pred")
++plt.plot(in_exc[testf,:],label="in_exc")
++plt.plot(out_exc[testf,:],label="out_exc")
++plt.legend()
++plt.show()
++"""
+ 
+ features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
+ features = features[:, :, :nb_used_features]
+@@ -93,12 +123,34 @@ features = features[:, :, :nb_used_features]
+ # nb_used_features=38, so 0...37, so lpc-gain not used
+ features[:,:,18:36] = 0   # zero out 18..35, so pitch and pitch gain being fed in, lpc gain ignored
+ 
++"""
++# plot features to sanity check
++print(features.shape)
++testf=10
++plt.plot(features[testf,:,37:38])
++plt.show()
++"""
++
+ fpad1 = np.concatenate([features[0:1, 0:2, :], features[:-1, -2:, :]], axis=0)
+ fpad2 = np.concatenate([features[1:, :2, :], features[0:1, -2:, :]], axis=0)
+ features = np.concatenate([fpad1, features, fpad2], axis=1)
+ 
+-# pitch feature uses as well as cesptrals
++# pitch feature uses as well as cepstrals
+ periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
++print(periods.shape)
++# sanity check training data against pitch embedding range (must run before the zeroing below)
++assert np.all(periods >= 40), "pitch embedding < 40"
++assert np.all(periods < 256), "pitch embeddeding > 255"
++if args.no_pitch_embedding:
++    print("no_pitch_embedding")
++    periods[:] = 0  # all-zero periods would always fail the >= 40 check if it ran after this
++
++"""
++# plot pitch to sanity check
++print(features.shape, periods.shape)
++plt.plot(periods.reshape(-1)[:1000])
++plt.show()
++"""
+ 
+ in_data = np.concatenate([sig, pred, in_exc], axis=-1)
+ 
+@@ -108,9 +160,8 @@ del in_exc
+ 
+ # dump models to disk as we go
+ #checkpoint = ModelCheckpoint('lpcnet20h_384_10_G16_{epoch:02d}.h5')
+-checkpoint = ModelCheckpoint(prefix + '_{epoch:02d}.h5')
++checkpoint = ModelCheckpoint(prefix + '_{epoch:d}.h5')
+ 
+ # use this to reload a partially trained model
+-#model.load_weights('lpcnet_190203_07.h5')
+ model.compile(optimizer=Adam(0.001, amsgrad=True, decay=5e-5), loss='sparse_categorical_crossentropy')
+-model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.1, callbacks=[checkpoint, lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))])
++model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, callbacks=[checkpoint, lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))])
+diff --git a/train_pred2.sh b/src/train_pred2.sh
+similarity index 100%
+rename from train_pred2.sh
+rename to src/train_pred2.sh
+diff --git a/src/vec_avx.h b/src/vec_avx.h
+index 1e58f8d..520b5b2 100644
+--- a/src/vec_avx.h
++++ b/src/vec_avx.h
+@@ -79,7 +79,7 @@ static __m128 exp4_approx(__m128 X)
+    Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
+    return Y;
+ }
+-static __m256 exp8_approx(__m256 X)
++static inline __m256 exp8_approx(__m256 X)
+ {
+    __m256 Y;
+    __m128 Xhi, Xlo, Yhi, Ylo;
+diff --git a/src/vec_sse.h b/src/vec_sse.h
+new file mode 100644
+index 0000000..82ddd42
+--- /dev/null
++++ b/src/vec_sse.h
+@@ -0,0 +1,211 @@
++/* Copyright (c) 2020 SASANO Takayoshi
++                 2018 David Rowe
++                 2018 Mozilla
++                 2008-2011 Octasic Inc.
++                 2012-2017 Jean-Marc Valin */
++/*
++   Redistribution and use in source and binary forms, with or without
++   modification, are permitted provided that the following conditions
++   are met:
++
++   - Redistributions of source code must retain the above copyright
++   notice, this list of conditions and the following disclaimer.
++
++   - Redistributions in binary form must reproduce the above copyright
++   notice, this list of conditions and the following disclaimer in the
++   documentation and/or other materials provided with the distribution.
++
++   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
++   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
++   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
++   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++/*
++  SSE implementation of vector operations, compile with -msse
++  port from Arm NEON support
++*/
++
++#include <xmmintrin.h>
++
++#ifndef LPCNET_TEST
++static float celt_exp2(float x)   /* polynomial approximation of 2^x; returns 0 when floor(x) < -50 */
++{
++    int integer;
++    float frac;
++    union {
++	float f;
++	opus_uint32 i;
++    } res;
++    integer = floor(x);
++    if (integer < -50)
++	return 0;
++    frac = x-integer;   /* fractional part, in [0,1) */
++    /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
++    res.f = 0.99992522f + frac * (0.69583354f
++				  + frac * (0.22606716f + 0.078024523f*frac));
++    res.i = (res.i + ((opus_uint32)integer<<23)) & 0x7fffffff;   /* shift as unsigned: integer may be negative, and << on a negative int is undefined */
++    return res.f;
++}
++#define celt_exp_sse(x) celt_exp2((x)*1.44269504f)
++
++static float tansig_approx(float x)
++{
++    int i;
++    float y, dy;
++    float sign=1;
++    /* Tests are reversed to catch NaNs */
++    if (!(x<8))
++        return 1;
++    if (!(x>-8))
++        return -1;
++#ifndef FIXED_POINT
++    /* Another check in case of -ffast-math */
++    if (celt_isnan(x))
++	return 0;
++#endif
++    if (x<0)
++    {
++	x=-x;
++	sign=-1;
++    }
++    i = (int)floor(.5f+25*x);
++    x -= .04f*i;
++    y = tansig_table[i];
++    dy = 1-y*y;
++    y = y + x*dy*(1 - y*x);
++    return sign*y;
++}
++
++static OPUS_INLINE float sigmoid_approx(float x)
++{
++    return .5f + .5f*tansig_approx(.5f*x);
++}
++
++static void softmax(float *y, const float *x, int N)   /* writes y[i] = celt_exp_sse(x[i]) for i in [0,N); no normalisation is done here */
++{
++    int i;
++    for (i=0;i<N;i++)
++        y[i] = celt_exp_sse(x[i]);
++}
++
++static void vec_tanh(float *y, const float *x, int N)
++{
++    int i;
++    for (i=0;i<N;i++)
++    {
++        y[i] = tansig_approx(x[i]);
++    }
++}
++
++static void vec_sigmoid(float *y, const float *x, int N)
++{
++    int i;
++    for (i=0;i<N;i++)
++    {
++        y[i] = sigmoid_approx(x[i]);
++    }
++}
++#endif
++
++static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
++{   /* accumulates out[r] += weights[j*col_stride + r] * x[j] over all j, 16 rows per outer step */
++    int i, j;
++    for (i=0;i<rows;i+=16)   /* each step touches 16 rows; presumably rows is a multiple of 16 - confirm with callers */
++    {
++	float * restrict y = &out[i];
++      
++	/* keep y[0..15] in registers for duration of inner loop */
++      
++	__m128 y0_3 = _mm_loadu_ps(&y[0]);   /* loadu/storeu: unaligned access, no alignment needed on out */
++	__m128 y4_7 = _mm_loadu_ps(&y[4]);
++	__m128 y8_11 = _mm_loadu_ps(&y[8]);
++	__m128 y12_15 = _mm_loadu_ps(&y[12]);
++      
++	for (j=0;j<cols;j++)
++	{
++	    const float * restrict w;
++	    __m128 wvec0_3, wvec4_7, wvec8_11, wvec12_15;
++	    __m128 xj = _mm_set1_ps(x[j]);   /* x[j] copied into all four lanes */
++
++	    w = &weights[j*col_stride + i];   /* 16 weights for input j, rows i..i+15 */
++
++	    wvec0_3 = _mm_loadu_ps(&w[0]);
++	    wvec4_7 = _mm_loadu_ps(&w[4]);
++	    wvec8_11 = _mm_loadu_ps(&w[8]);
++	    wvec12_15 = _mm_loadu_ps(&w[12]);
++
++	    wvec0_3 = _mm_mul_ps(wvec0_3, xj);
++	    wvec4_7 = _mm_mul_ps(wvec4_7, xj);
++	    wvec8_11 = _mm_mul_ps(wvec8_11, xj);
++	    wvec12_15 = _mm_mul_ps(wvec12_15, xj);
++
++	    y0_3 = _mm_add_ps(y0_3, wvec0_3);
++	    y4_7 = _mm_add_ps(y4_7, wvec4_7);
++	    y8_11 = _mm_add_ps(y8_11, wvec8_11);
++	    y12_15 = _mm_add_ps(y12_15, wvec12_15);
++	}
++
++	/* save y[0..15] back to memory */
++      
++	_mm_storeu_ps(&y[0], y0_3);
++	_mm_storeu_ps(&y[4], y4_7);
++	_mm_storeu_ps(&y[8], y8_11);
++	_mm_storeu_ps(&y[12], y12_15);
++    }
++}
++
++static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
++{   /* sparse accumulate: idx holds, for each 16-row block, a count followed by that many indices into x */
++    int i, j;
++    for (i=0;i<rows;i+=16)   /* each step touches 16 rows; presumably rows is a multiple of 16 - confirm with callers */
++    {
++	int cols;
++	cols = *idx++;   /* number of stored entries for this 16-row block */
++	float * restrict y = &out[i];
++
++	/* keep y[0..15] in registers for duration of inner loop */
++      
++	__m128 y0_3 = _mm_loadu_ps(&y[0]);
++	__m128 y4_7 = _mm_loadu_ps(&y[4]);
++	__m128 y8_11 = _mm_loadu_ps(&y[8]);
++	__m128 y12_15 = _mm_loadu_ps(&y[12]);
++      
++	for (j=0;j<cols;j++)
++	{
++	    __m128 wvec;
++	    __m128 xj = _mm_set1_ps(x[*idx++]);   /* next idx entry picks the x element, copied into all four lanes */
++
++	    wvec = _mm_loadu_ps(&w[0]);
++	    wvec = _mm_mul_ps(wvec, xj);
++	    y0_3 = _mm_add_ps(y0_3, wvec);
++
++	    wvec = _mm_loadu_ps(&w[4]);
++	    wvec = _mm_mul_ps(wvec, xj);
++	    y4_7 = _mm_add_ps(y4_7, wvec);
++
++	    wvec = _mm_loadu_ps(&w[8]);
++	    wvec = _mm_mul_ps(wvec, xj);
++	    y8_11 = _mm_add_ps(y8_11, wvec);
++
++	    wvec = _mm_loadu_ps(&w[12]);
++	    wvec = _mm_mul_ps(wvec, xj);
++	    y12_15 = _mm_add_ps(y12_15, wvec);
++
++	    w += 16;   /* weights are read sequentially, 16 floats per stored entry */
++	}
++
++	/* save y[0..15] back to memory */
++      
++	_mm_storeu_ps(&y[0], y0_3);
++	_mm_storeu_ps(&y[4], y4_7);
++	_mm_storeu_ps(&y[8], y8_11);
++	_mm_storeu_ps(&y[12], y12_15);
++    }
++}
+diff --git a/train_pred1.sh b/train_pred1.sh
+deleted file mode 100755
+index 3694252..0000000
+--- a/train_pred1.sh
++++ /dev/null
+@@ -1,31 +0,0 @@
+-#!/bin/sh -x
+-# train_pred2.sh
+-# David Rowe Jan 2019
+-# Train multi-stage VQ for LPCNet
+-
+-PATH=$PATH:/home/david/codec2-dev/build_linux/misc/
+-
+-if [ $# -lt 1 ]; then
+-    echo "usage: ./train_pred1.sh [-w] VQprefix"
+-    echo "       $ ./train_pred1.sh pred1_v1"
+-    exit 1
+-fi
+-
+-VQ_NAME=$1
+-echo $VQ_NAME
+-
+-K=18
+-STOP=1E-2
+-
+-echo "*********"
+-echo "Pred 1"
+-echo "*********"
+-echo "weighting dctLy[0] ...."
+-t=$(mktemp)
+-extract all_speech_features.f32 $t 0 17 10 1.0 1
+-cat $t | ./weight > $VQ_NAME'_s0.f32'
+-vqtrain $VQ_NAME'_s0.f32' $K 2048 $VQ_NAME'_stage1.f32' -r $VQ_NAME'_s1.f32' -s $STOP 
+-vqtrain $VQ_NAME'_s1.f32' $K 2048 $VQ_NAME'_stage2.f32' -r $VQ_NAME'_s2.f32' -s $STOP
+-vqtrain $VQ_NAME'_s2.f32' $K 2048 $VQ_NAME'_stage3.f32' -r $VQ_NAME'_s3.f32' -s $STOP 
+-vqtrain $VQ_NAME'_s3.f32' $K 2048 $VQ_NAME'_stage4.f32' -r $VQ_NAME'_s4.f32' -s $STOP 
+-
+diff --git a/unittest/test_core_nn.sh b/unittest/test_core_nn.sh
+index 392c897..cd955c7 100755
+--- a/unittest/test_core_nn.sh
++++ b/unittest/test_core_nn.sh
+@@ -1,4 +1,4 @@
+-#!/bin/bash
++#!/bin/bash -x
+ # test_core_nn.sh
+ #
+ 
+@@ -60,7 +60,7 @@ if [ ! -z $SYNTH_mag ]; then
+     ../build_linux/src/dump_data --mag --test --c2pitch ../wav/c01_01.wav c01_01.f32
+     diff c01_01_mag.f32 c01_01.f32 || { echo "ERROR in synth .f32 output! Exiting..."; exit 1; }
+     echo "mag .f32 OK"
+-    ../build_linux/src/test_lpcnet --mag -n lpcnet_190804a.f32 c01_01.f32 c01_01_out.raw
++    ../build_linux/src/test_lpcnet --mag 1 -n lpcnet_190804a.f32 c01_01.f32 c01_01_out.raw
+     diff c01_01_190804a_targ.raw c01_01_out.raw || { echo "ERROR in synth .raw output! Exiting..."; exit 1; }
+     echo "mag .raw OK"
+ fi
diff --git a/lpcnetfreedv-vector-updates.patch b/lpcnetfreedv-vector-updates.patch
deleted file mode 100644
index 6000514..0000000
--- a/lpcnetfreedv-vector-updates.patch
+++ /dev/null
@@ -1,62 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 680f52c..e536f30 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -52,6 +52,9 @@ message(STATUS "LPCNet version: ${LPCNET_VERSION}")
- # Set default flags
- set(CMAKE_C_FLAGS "-Wall -W -Wextra -Wno-unused-function -O3 -g -I. -MD ${CMAKE_C_FLAGS} -DENABLE_ASSERTIONS")
- 
-+# Arch specific stuff here
-+message(STATUS "Host system arch is: ${CMAKE_SYSTEM_PROCESSOR}")
-+
- # Detection of available CPU optimizations
- if(NOT DISABLE_CPU_OPTIMIZATION)
-     if(UNIX AND NOT APPLE)
-@@ -87,7 +90,7 @@ elseif(${AVX} OR ${AVX} GREATER 0)
-     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
- endif()
- 
--# RPi
-+# RPi / ARM 32bit
- if(${NEON} OR ${NEON} GREATER 0)
-     message(STATUS "neon processor flags found or enabled.")
-     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=armv8-a -mtune=cortex-a53")
-diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
-index 41a78dc..6c49f5e 100644
---- a/src/CMakeLists.txt
-+++ b/src/CMakeLists.txt
-@@ -49,7 +49,7 @@ target_link_libraries(dump_data lpcnetfreedv m codec2)
- add_executable(test_lpcnet test_lpcnet.c)
- target_link_libraries(test_lpcnet lpcnetfreedv m codec2)
- 
--if(AVX OR AVX2)
-+if(AVX OR AVX2 OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
-     add_executable(test_vec test_vec.c)
-     target_link_libraries(test_vec m)
- else()
-diff --git a/src/nnet.c b/src/nnet.c
-index 8ad4a26..ccb9c94 100644
---- a/src/nnet.c
-+++ b/src/nnet.c
-@@ -43,7 +43,7 @@
- 
- #ifdef __AVX__
- #include "vec_avx.h"
--#elif __ARM_NEON__
-+#elif __ARM_NEON__ || __aarch64__
- #include "vec_neon.h"
- #else
- #warning Compiling without any vectorization. This code will be very slow
-diff --git a/src/test_vec.c b/src/test_vec.c
-index 09b51e7..254292b 100644
---- a/src/test_vec.c
-+++ b/src/test_vec.c
-@@ -26,7 +26,7 @@ const char simd[]="AVX2";
- #else
- const char simd[]="AVX";
- #endif
--#elif __ARM_NEON__
-+#elif __ARM_NEON__ || __aarch64__
- #include "vec_neon.h"
- const char simd[]="NEON";
- #else
diff --git a/lpcnetfreedv.spec b/lpcnetfreedv.spec
index 07b763b..6de645a 100644
--- a/lpcnetfreedv.spec
+++ b/lpcnetfreedv.spec
@@ -1,6 +1,9 @@
+%undefine __cmake_in_source_build
+%global sover 0.2
+
 Name:           lpcnetfreedv
 Version:        0.2
-Release:        4%{?dist}
+Release:        5%{?dist}
 Summary:        LPCNet for FreeDV
 
 License:        BSD
@@ -8,10 +11,7 @@ URL:            https://github.com/drowe67/LPCNet
 Source0:        https://github.com/drowe67/LPCNet/archive/v%{version}/LPCNet-%{version}.tar.gz
 Source1:        http://rowetel.com/downloads/deep/lpcnet_191005_v1.0.tgz
 
-# Fixes for aarch64 which has NEON instructions natively
-Patch0:         lpcnetfreedv-vector-updates.patch
-# Make library private for FreeDV
-Patch1:         lpcnetfreedv-private_libs.patch
+Patch0:         lpcnetfreedv-test.patch
 
 BuildRequires:  cmake gcc
 BuildRequires:  codec2-devel
@@ -37,7 +37,7 @@ Summary:        Development files and tools for LPCNet
 %build
 # Add model data archive to the build directory so CMake finds it.
 mkdir -p %{_vpath_builddir}
-cp %{SOURCE1} %{_vpath_builddir}/
+cp %{SOURCE1} %{__cmake_builddir}/
 
 # We need to force optimizations to specific values since the build system and
 # host system will likely be different.
@@ -63,15 +63,19 @@ cp %{SOURCE1} %{_vpath_builddir}/
 %files
 %license COPYING
 %doc README.md
-%{_libdir}/%{name}/lib%{name}.so
+%{_libdir}/lib%{name}.so.%{sover}
 
 %files devel
 %{_bindir}/*
 %{_includedir}/lpcnet/
 %{_libdir}/cmake/lpcnetfreedv/
+%{_libdir}/lib%{name}.so
 
 
 %changelog
+* Sun Dec 20 2020 Richard Shaw <hobbes1069@gmail.com> - 0.2-5
+- Change library install location to %%{_libdir}.
+
 * Sat Aug 01 2020 Fedora Release Engineering <releng@fedoraproject.org> - 0.2-4
 - Second attempt - Rebuilt for
   https://fedoraproject.org/wiki/Fedora_33_Mass_Rebuild