Android main thread UI becomes unresponsive when implementing Google Speech-to-Text. How do I fix it?

Problem Description

Currently, I am implementing Google Speech-to-Text in my project. The sample code I referenced is this: click here

I have used the SpeechService and VoiceRecorder classes from this project.

public class SpeechService extends Service {

public static final List<String> SCOPE =
        Collections.singletonList("https://www.googleapis.com/auth/cloud-platform");
private static final String TAG = "SpeechService";

private static final String PREFS = "SpeechService";
private static final String PREF_ACCESS_TOKEN_VALUE = "access_token_value";
private static final String PREF_ACCESS_TOKEN_EXPIRATION_TIME = "access_token_expiration_time";

/**
 * We reuse an access token if its expiration time is longer than this.
 */
private static final int ACCESS_TOKEN_EXPIRATION_TOLERANCE = 30 * 60 * 1000; // thirty minutes
/**
 * We refresh the current access token before it expires.
 */
private static final int ACCESS_TOKEN_FETCH_MARGIN = 60 * 1000; // one minute
private static final String HOSTNAME = "speech.googleapis.com";
private static final int PORT = 443;
private static Handler mHandler;
private final SpeechBinder mBinder = new SpeechBinder();
private final ArrayList<Listener> mListeners = new ArrayList<>();
private final StreamObserver<StreamingRecognizeResponse> mResponseObserver
        = new StreamObserver<StreamingRecognizeResponse>() {
    @Override
    public void onNext(StreamingRecognizeResponse response) {
        Log.e("Speech", "Recognized");
        String text = null;
        boolean isFinal = false;
        if (response.getResultsCount() > 0) {

            System.out.println("result count....."+String.valueOf(response.getResultsCount()));

            final StreamingRecognitionResult result = response.getResults(0);
            isFinal = result.getIsFinal();
            if (result.getAlternativesCount() > 0) {
                final SpeechRecognitionAlternative alternative = result.getAlternatives(0);
                text = alternative.getTranscript();
            }
        }
        if (text != null && isFinal) {

            for (Listener listener : mListeners) {
                listener.onSpeechRecognized(text, isFinal);
            }

        } else {
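            // Interim (non-final) results land here on every onNext() call.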

            for (Listener listener : mListeners) {
                listener.onRandomStupidity();
            }
        }

    }

    @Override
    public void onError(Throwable t) {
        Log.e(TAG, "Error calling the API.", t);
        for(Listener listener : mListeners){
            listener.onErrorRecognizing();
        }

    }

    @Override
    public void onCompleted() {
        Log.i(TAG, "API completed.");
    }

};

private volatile AccessTokenTask mAccessTokenTask;
private final Runnable mFetchAccessTokenRunnable = new Runnable() {
    @Override
    public void run() {
        fetchAccessToken();
    }
};
private SpeechGrpc.SpeechStub mApi;
private StreamObserver<StreamingRecognizeRequest> mRequestObserver;

public static SpeechService from(IBinder binder) {
    return ((SpeechBinder) binder).getService();
}

@Override
public void onCreate() {
    super.onCreate();
    mHandler = new Handler();
    fetchAccessToken();
}

@Override
public void onDestroy() {
    super.onDestroy();
    mHandler.removeCallbacks(mFetchAccessTokenRunnable);
    mHandler = null;
    // Release the gRPC channel.
    if (mApi != null) {
        final ManagedChannel channel = (ManagedChannel) mApi.getChannel();
        if (channel != null && !channel.isShutdown()) {
            try {
                channel.shutdown().awaitTermination(5, TimeUnit.SECONDS);
            } catch (InterruptedException e) {
                Log.e(TAG, "Error shutting down the gRPC channel.", e);
            }
        }
        mApi = null;
    }
}

private void fetchAccessToken() {
    if (mAccessTokenTask != null) {
        return;
    }
    mAccessTokenTask = new AccessTokenTask();
    mAccessTokenTask.execute();
}

private String getDefaultLanguageCode() {
    final LangInnerResponse languageToLearn = MemoryCache.getLanguageToLearn();

    if(languageToLearn != null) {
        Log.e("Test Lang", languageToLearn.getCode());
        return languageToLearn.getCode();
    } else {
        final Locale locale = Locale.getDefault();
        final StringBuilder language = new StringBuilder(locale.getLanguage());
        final String country = locale.getCountry();
        if (!TextUtils.isEmpty(country)) {
            language.append("-");
            language.append(country);
        }
        return language.toString();
    }
}

@Nullable
@Override
public IBinder onBind(Intent intent) {
    return mBinder;
}

public void addListener(@NonNull Listener listener) {
    mListeners.add(listener);
}

public void removeListener(@NonNull Listener listener) {
    mListeners.remove(listener);
}

/**
 * Starts recognizing speech audio.
 *
 * @param sampleRate The sample rate of the audio.
 */
public void startRecognizing(int sampleRate) {
    if (mApi == null) {
        Log.w(TAG, "API not ready. Ignoring the request.");
        return;
    }
    System.out.println("calling api....");
    // Configure the API
    mRequestObserver = mApi.streamingRecognize(mResponseObserver);
    mRequestObserver.onNext(StreamingRecognizeRequest.newBuilder()
            .setStreamingConfig(StreamingRecognitionConfig.newBuilder()
                    .setConfig(RecognitionConfig.newBuilder()
                            .setLanguageCode(getDefaultLanguageCode())
                            .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
                            .setSampleRateHertz(sampleRate)
                            .build())
                    .setInterimResults(true)
                    .setSingleUtterance(true)
                    .build())
            .build());
}
/**
 * Recognizes the speech audio. This method should be called every time a chunk of byte buffer
 * is ready.
 *
 * @param data The audio data.
 * @param size The number of elements that are actually relevant in the {@code data}.
 */
public void recognize(byte[] data, int size) {
    if (mRequestObserver == null) {
        return;
    }
    // Call the streaming recognition API
    mRequestObserver.onNext(StreamingRecognizeRequest.newBuilder()
            .setAudioContent(ByteString.copyFrom(data, 0, size))
            .build());
}
/**
 * Finishes recognizing speech audio.
 */
public void finishRecognizing() {
    if (mRequestObserver == null) {
        return;
    }
    mRequestObserver.onCompleted();
    mRequestObserver = null;
}

public interface Listener {

    /**
     * Called when a new piece of text was recognized by the Speech API.
     *
     * @param text    The text.
     * @param isFinal {@code true} when the API finished processing audio.
     */
    void onSpeechRecognized(String text, boolean isFinal);

    void onErrorRecognizing();
    void onRandomStupidity();

}

/**
 * Authenticates the gRPC channel using the specified {@link GoogleCredentials}.
 */
private static class GoogleCredentialsInterceptor implements ClientInterceptor {

    private final Credentials mCredentials;

    private Metadata mCached;

    private Map<String, List<String>> mLastMetadata;

    GoogleCredentialsInterceptor(Credentials credentials) {
        mCredentials = credentials;
    }

    private static Metadata toHeaders(Map<String, List<String>> metadata) {
        Metadata headers = new Metadata();
        if (metadata != null) {
            for (String key : metadata.keySet()) {
                Metadata.Key<String> headerKey = Metadata.Key.of(
                        key, Metadata.ASCII_STRING_MARSHALLER);
                for (String value : metadata.get(key)) {
                    headers.put(headerKey, value);
                }
            }
        }
        return headers;
    }

    @Override
    public <ReqT, RespT> ClientCall<ReqT, RespT> interceptCall(
            final MethodDescriptor<ReqT, RespT> method, CallOptions callOptions,
            final Channel next) {
        return new ClientInterceptors.CheckedForwardingClientCall<ReqT, RespT>(
                next.newCall(method, callOptions)) {
            @Override
            protected void checkedStart(Listener<RespT> responseListener, Metadata headers)
                    throws StatusException {
                Metadata cachedSaved;
                URI uri = serviceUri(next, method);
                synchronized (this) {
                    Map<String, List<String>> latestMetadata = getRequestMetadata(uri);
                    if (mLastMetadata == null || mLastMetadata != latestMetadata) {
                        mLastMetadata = latestMetadata;
                        mCached = toHeaders(mLastMetadata);
                    }
                    cachedSaved = mCached;
                }
                headers.merge(cachedSaved);
                delegate().start(responseListener, headers);
            }
        };
    }

    /**
     * Generate a JWT-specific service URI. The URI is simply an identifier with enough
     * information for a service to know that the JWT was intended for it. The URI will
     * commonly be verified with a simple string equality check.
     */
    private URI serviceUri(Channel channel, MethodDescriptor<?, ?> method)
            throws StatusException {
        String authority = channel.authority();
        if (authority == null) {
            throw Status.UNAUTHENTICATED
                    .withDescription("Channel has no authority")
                    .asException();
        }
        // Always use HTTPS, by definition.
        final String scheme = "https";
        final int defaultPort = 443;
        String path = "/" + MethodDescriptor.extractFullServiceName(method.getFullMethodName());
        URI uri;
        try {
            uri = new URI(scheme, authority, path, null, null);
        } catch (URISyntaxException e) {
            throw Status.UNAUTHENTICATED
                    .withDescription("Unable to construct service URI for auth")
                    .withCause(e).asException();
        }
        // The default port must not be present. Alternative ports should be present.
        if (uri.getPort() == defaultPort) {
            uri = removePort(uri);
        }
        return uri;
    }

    private URI removePort(URI uri) throws StatusException {
        try {
            return new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), -1 /* port */,
                    uri.getPath(), uri.getQuery(), uri.getFragment());
        } catch (URISyntaxException e) {
            throw Status.UNAUTHENTICATED
                    .withDescription("Unable to construct service URI after removing port")
                    .withCause(e).asException();
        }
    }

    private Map<String, List<String>> getRequestMetadata(URI uri) throws StatusException {
        try {
            return mCredentials.getRequestMetadata(uri);
        } catch (IOException e) {
            throw Status.UNAUTHENTICATED.withCause(e).asException();
        }
    }

}

private class SpeechBinder extends Binder {

    SpeechService getService() {
        return SpeechService.this;
    }

}

private class CreateApiSingle implements SingleOnSubscribe<SpeechGrpc.SpeechStub> {

    @Override
    public void subscribe(SingleEmitter<SpeechGrpc.SpeechStub> emitter) throws Exception {
        final AccessToken accessToken = generateCredentials();
        final SpeechGrpc.SpeechStub api = generateApi(accessToken);

        emitter.onSuccess(api);
    }

    private AccessToken generateCredentials() throws IOException {
        final SharedPreferences prefs =
                getSharedPreferences(PREFS, Context.MODE_PRIVATE);
        String tokenValue = prefs.getString(PREF_ACCESS_TOKEN_VALUE, null);
        long expirationTime = prefs.getLong(PREF_ACCESS_TOKEN_EXPIRATION_TIME, -1);

        // Check if the current token is still valid for a while
        if (tokenValue != null && expirationTime > 0) {
            if (expirationTime
                    > System.currentTimeMillis() + ACCESS_TOKEN_EXPIRATION_TOLERANCE) {
                return new AccessToken(tokenValue, new Date(expirationTime));
            }
        }

        // ***** WARNING *****
        // In this sample, we load the credential from a JSON file stored in a raw resource
        // folder of this client app. You should never do this in your app. Instead, store
        // the file in your server and obtain an access token from there.
        // *******************
        final InputStream stream = getResources().openRawResource(R.raw.credential);
        final GoogleCredentials credentials = GoogleCredentials.fromStream(stream)
                .createScoped(SCOPE);
        final AccessToken token = credentials.refreshAccessToken();
        prefs.edit()
                .putString(PREF_ACCESS_TOKEN_VALUE, token.getTokenValue())
                .putLong(PREF_ACCESS_TOKEN_EXPIRATION_TIME,
                        token.getExpirationTime().getTime())
                .apply();

        stream.close();
        return token;
    }

    private SpeechGrpc.SpeechStub generateApi(AccessToken accessToken) {
        final ManagedChannel channel = new OkHttpChannelProvider()
                .builderForAddress(HOSTNAME, PORT)
                .nameResolverFactory(new DnsNameResolverProvider())
                .intercept(new GoogleCredentialsInterceptor(new GoogleCredentials(accessToken)
                        .createScoped(SCOPE)))
                .build();
        return SpeechGrpc.newStub(channel);
    }

}

private class AccessTokenTask extends AsyncTask<Void, Void, AccessToken> {

    @Override
    protected AccessToken doInBackground(Void... voids) {
        final SharedPreferences prefs =
                getSharedPreferences(PREFS, Context.MODE_PRIVATE);
        String tokenValue = prefs.getString(PREF_ACCESS_TOKEN_VALUE, null);
        long expirationTime = prefs.getLong(PREF_ACCESS_TOKEN_EXPIRATION_TIME, -1);

        // Check if the current token is still valid for a while
        if (tokenValue != null && expirationTime > 0) {
            if (expirationTime
                    > System.currentTimeMillis() + ACCESS_TOKEN_EXPIRATION_TOLERANCE) {
                return new AccessToken(tokenValue, new Date(expirationTime));
            }
        }

        // ***** WARNING *****
        // In this sample, we load the credential from a JSON file stored in a raw resource
        // folder of this client app. You should never do this in your app. Instead, store
        // the file in your server and obtain an access token from there.
        // *******************
        final InputStream stream = getResources().openRawResource(R.raw.credential);
        try {
            final GoogleCredentials credentials = GoogleCredentials.fromStream(stream)
                    .createScoped(SCOPE);
            final AccessToken token = credentials.refreshAccessToken();
            prefs.edit()
                    .putString(PREF_ACCESS_TOKEN_VALUE, token.getTokenValue())
                    .putLong(PREF_ACCESS_TOKEN_EXPIRATION_TIME,
                            token.getExpirationTime().getTime())
                    .apply();
            return token;
        } catch (IOException e) {
            Log.e(TAG, "Failed to obtain access token.", e);
        }
        return null;
    }

    @Override
    protected void onPostExecute(AccessToken accessToken) {
        mAccessTokenTask = null;
        final ManagedChannel channel = new OkHttpChannelProvider()
                .builderForAddress(HOSTNAME, PORT)
                .nameResolverFactory(new DnsNameResolverProvider())
                .intercept(new GoogleCredentialsInterceptor(new GoogleCredentials(accessToken)
                        .createScoped(SCOPE)))
                .build();
        mApi = SpeechGrpc.newStub(channel);

        // Schedule access token refresh before it expires
        if (mHandler != null) {
            mHandler.postDelayed(mFetchAccessTokenRunnable,
                    Math.max(accessToken.getExpirationTime().getTime()
                            - System.currentTimeMillis()
                            - ACCESS_TOKEN_FETCH_MARGIN, ACCESS_TOKEN_EXPIRATION_TOLERANCE));
        }
    }
}
}

public class VoiceRecorder {

private static final int[] SAMPLE_RATE_CANDIDATES = new int[]{48000, 44100};

private static final int CHANNEL = AudioFormat.CHANNEL_IN_MONO;
private static final int ENCODING = AudioFormat.ENCODING_PCM_16BIT;

private static final int AMPLITUDE_THRESHOLD = 1500;
private static final int SPEECH_TIMEOUT_MILLIS = 2000;
private static final int MAX_SPEECH_LENGTH_MILLIS = 30 * 1000;

public static abstract class Callback {

    /**
     * Called when the recorder starts hearing voice.
     */
    public void onVoiceStart() {
    }

    /**
     * Called when the recorder is hearing voice.
     *
     * @param data The audio data in {@link AudioFormat#ENCODING_PCM_16BIT}.
     * @param size The size of the actual data in {@code data}.
     */
    public void onVoice(byte[] data, int size) {
    }

    /**
     * Called when the recorder stops hearing voice.
     */
    public void onVoiceEnd() {
    }
}

private final Callback mCallback;

private AudioRecord mAudioRecord;

private Thread mThread;

private byte[] mBuffer;

private final Object mLock = new Object();

/** The timestamp of the last time that voice is heard. */
private long mLastVoiceHeardMillis = Long.MAX_VALUE;

/** The timestamp when the current voice is started. */
private long mVoiceStartedMillis;

public VoiceRecorder(@NonNull Callback callback) {
    mCallback = callback;
}

/**
 * Starts recording audio.
 *
 * <p>The caller is responsible for calling {@link #stop()} later.</p>
 */
public void start() {
    // Stop recording if it is currently ongoing.
    stop();
    // Try to create a new recording session.
    mAudioRecord = createAudioRecord();
    if (mAudioRecord == null) {
        throw new RuntimeException("Cannot instantiate VoiceRecorder");
    }
    // Start recording.
    mAudioRecord.startRecording();
    // Start processing the captured audio.
    mThread = new Thread(new ProcessVoice());
    mThread.start();
}

/**
 * Stops recording audio.
 */
public void stop() {
    synchronized (mLock) {
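        // NOTE: if stop() runs on the main thread, it blocks here until the
        // recording thread releases mLock (see ProcessVoice.run()).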
        System.out.println("stop audio record....");
        dismiss();
        if (mThread != null) {
            mThread.interrupt();
            mThread = null;
        }
        if (mAudioRecord != null) {
            mAudioRecord.stop();
            mAudioRecord.release();
            mAudioRecord = null;
        }
        mBuffer = null;
        System.out.println("stop audio record....2");
    }
}

/**
 * Dismisses the currently ongoing utterance.
 */
public void dismiss() {
    if (mLastVoiceHeardMillis != Long.MAX_VALUE) {
        mLastVoiceHeardMillis = Long.MAX_VALUE;
        mCallback.onVoiceEnd();
    }
}

/**
 * Retrieves the sample rate currently used to record audio.
 *
 * @return The sample rate of recorded audio.
 */
public int getSampleRate() {
    if (mAudioRecord != null) {
        return mAudioRecord.getSampleRate();
    }
    return 0;
}

/**
 * Creates a new {@link AudioRecord}.
 *
 * @return A newly created {@link AudioRecord}, or null if it cannot be created (missing
 * permissions?).
 */
private AudioRecord createAudioRecord() {
    for (int sampleRate : SAMPLE_RATE_CANDIDATES) {
        final int sizeInBytes = AudioRecord.getMinBufferSize(sampleRate, CHANNEL, ENCODING);
        if (sizeInBytes == AudioRecord.ERROR_BAD_VALUE) {
            continue;
        }
        final AudioRecord audioRecord = new AudioRecord(MediaRecorder.AudioSource.MIC,
                sampleRate, CHANNEL, ENCODING, sizeInBytes);
        if (audioRecord.getState() == AudioRecord.STATE_INITIALIZED) {
            mBuffer = new byte[sizeInBytes];
            return audioRecord;
        } else {
            audioRecord.release();
        }
    }
    return null;
}

/**
 * Continuously processes the captured audio and notifies {@link #mCallback} of corresponding
 * events.
 */
private class ProcessVoice implements Runnable {

    @Override
    public void run() {
        while (true) {
            synchronized (mLock) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
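                // NOTE: read() blocks until the buffer fills, and it runs while
                // holding mLock, the same lock that stop() acquires on the main thread.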
                final int size = mAudioRecord.read(mBuffer, 0, mBuffer.length);
                final long now = System.currentTimeMillis();
                if (isHearingVoice(mBuffer, size)) {
                    if (mLastVoiceHeardMillis == Long.MAX_VALUE) {
                        mVoiceStartedMillis = now;
                        mCallback.onVoiceStart();
                    }
                    mCallback.onVoice(mBuffer, size);
                    mLastVoiceHeardMillis = now;
                    if (now - mVoiceStartedMillis > MAX_SPEECH_LENGTH_MILLIS) {
                        end();
                    }
                } else if (mLastVoiceHeardMillis != Long.MAX_VALUE) {
                    mCallback.onVoice(mBuffer, size);
                    if (now - mLastVoiceHeardMillis > SPEECH_TIMEOUT_MILLIS) {
                        end();
                    }
                }
            }
        }
    }

    private void end() {
        mLastVoiceHeardMillis = Long.MAX_VALUE;
        mCallback.onVoiceEnd();

        System.out.println("end...");

    }

    private boolean isHearingVoice(byte[] buffer, int size) {
        for (int i = 0; i < size - 1; i += 2) {
            // The buffer has LINEAR16 in little endian.
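            // Approximates each sample's amplitude as |high byte| << 8 + |low byte|.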
            int s = buffer[i + 1];
            if (s < 0) s *= -1;
            s <<= 8;
            s += Math.abs(buffer[i]);
            if (s > AMPLITUDE_THRESHOLD) {
                return true;
            }
        }
        return false;
    }
}
}

Then I implemented the SpeechService and VoiceRecorder callbacks as follows:

private VoiceRecorder voiceRecorder;
private final SpeechService.Listener speechServiceListener = new SpeechService.Listener() {

    @Override
    public void onSpeechRecognized(final String text, final boolean isFinal) {

        if (isFinal) {
         System.out.println("ui thread...");

            if (!TextUtils.isEmpty(text)) {
                runOnUiThread(() -> {

                    showMessage(text);
                    flingAnswer(text);
                });
            }
        }

    }

    @Override
    public void onErrorRecognizing() {
        // This callback arrives on a gRPC transport thread, so post to the UI thread.
        runOnUiThread(() -> showMessage("Please try again. Could not detect."));
    }

    @Override
    public void onRandomStupidity() {

    }

};

private SpeechService speechService;
private final VoiceRecorder.Callback voiceCallback = new VoiceRecorder.Callback() {

    @Override
    public void onVoiceStart() {
        if (speechService != null) {
            System.out.println("voice start....");


            speechService.startRecognizing(voiceRecorder.getSampleRate());
        }
    }

    @Override
    public void onVoice(byte[] data, int size) {
        if (speechService != null) {
            speechService.recognize(data, size);
        }
    }

    @Override
    public void onVoiceEnd() {
        if (speechService != null) {
            speechService.finishRecognizing();
        }
    }

};
private final ServiceConnection serviceConnection = new ServiceConnection() {

    @Override
    public void onServiceConnected(ComponentName componentName, IBinder binder) {
        speechService = SpeechService.from(binder);
        speechService.addListener(speechServiceListener);
    }

    @Override
    public void onServiceDisconnected(ComponentName componentName) {
        speechService = null;
    }

};

For voice input, the code is as follows:

@Override
public void stopRecognizing() {
    stopVoiceRecorder();

    Log.e("Recording", "Stopped");
}


@Override
public void startRecognizing() {
    if (permissionManager != null && permissionManager.askForPermissions()) {
        startVoiceRecorder();
        vibrate.vibrate(50); // Provide haptic feedback to the user on press.

    }
    Log.e("Recording", "Started");
}

binding.imgVoice.setOnTouchListener((v, event) -> {
    switch (event.getAction()) {
        case MotionEvent.ACTION_UP:
            System.out.println("up...");
            mCallback.stopRecognizing();
            binding.imgVoice
                    .animate()
                    .scaleX(1.0f)
                    .scaleY(1.0f);
            binding.imgVoice.setVisibility(View.GONE);
            binding.progressBar.setVisibility(View.VISIBLE);
            break;
        case MotionEvent.ACTION_DOWN:
            System.out.println("down...");
            binding.imgVoice
                    .animate()
                    .scaleX(1.8f)
                    .scaleY(1.8f);
            mCallback.startRecognizing();
            break;
    }
    return true;
});

When I press the microphone, the event registers as ACTION_DOWN and I start the voice recorder, and when I release it, ACTION_UP stops the recorder. On ACTION_DOWN I also scale up the microphone icon, which should scale back down on ACTION_UP. But the UI freezes most of the time. I also found that the StreamObserver's onNext() callback keeps being invoked until isFinal becomes true. The recorder start/stop helpers are shown after the sketch below.
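One thing worth ruling out is UI work on the gRPC transport thread: onNext() fires for every interim result, and any listener that touches views directly from that thread will cause trouble. A minimal sketch of routing every listener callback through a main-thread Handler follows; the mMainHandler field and notifyListeners() helper are hypothetical additions to SpeechService, not part of the referenced sample (java.util.function.Consumer requires API 24+):

// Sketch only: dispatch SpeechService.Listener callbacks on the main thread.
// mMainHandler and notifyListeners() are hypothetical additions to SpeechService.
private final Handler mMainHandler = new Handler(Looper.getMainLooper());

private void notifyListeners(java.util.function.Consumer<Listener> event) {
    mMainHandler.post(() -> {
        for (Listener listener : mListeners) {
            event.accept(listener);
        }
    });
}

// In mResponseObserver.onNext(), instead of iterating mListeners directly:
//     notifyListeners(l -> l.onSpeechRecognized(recognizedText, true));
// and in onError():
//     notifyListeners(Listener::onErrorRecognizing);

With the callbacks already on the main thread, the runOnUiThread() wrapper in onSpeechRecognized becomes unnecessary.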

 private void startVoiceRecorder() {
    if (voiceRecorder != null) {
        voiceRecorder.stop();
    }
    voiceRecorder = new VoiceRecorder(voiceCallback);
    voiceRecorder.start();
}

private void stopVoiceRecorder() {
    if (voiceRecorder != null) {
        voiceRecorder.stop();
        voiceRecorder = null;
    }
}

But I want the microphone icon to scale back down as soon as I release it (on the ACTION_UP event), and that is not happening. Can anyone help me solve this?

Thanks in advance.

Tags: android, multithreading, speech-to-text, motionevent, grpc-java

Solution
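One plausible cause is visible in the posted VoiceRecorder: ProcessVoice.run() holds mLock across the blocking AudioRecord.read() call, while stop(), invoked from the touch listener on the main thread, synchronizes on the same lock. Whenever a read stalls, the main thread sits waiting for mLock, the ACTION_UP animation never runs, and the UI freezes. Below is a minimal sketch of a ProcessVoice.run() loop that performs the blocking read outside the lock; it is a suggested restructuring under that assumption, not the original sample's code:

    @Override
    public void run() {
        while (true) {
            final AudioRecord record;
            final byte[] buffer;
            synchronized (mLock) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                record = mAudioRecord;
                buffer = mBuffer;
            }
            if (record == null || buffer == null) {
                break;
            }
            // The blocking read happens without holding mLock, so stop() on the
            // main thread is never forced to wait for a full audio buffer.
            final int size = record.read(buffer, 0, buffer.length);
            if (size < 0) {
                break; // the recorder was stopped or released mid-read
            }
            synchronized (mLock) {
                if (Thread.currentThread().isInterrupted() || mAudioRecord == null) {
                    break;
                }
                final long now = System.currentTimeMillis();
                if (isHearingVoice(buffer, size)) {
                    if (mLastVoiceHeardMillis == Long.MAX_VALUE) {
                        mVoiceStartedMillis = now;
                        mCallback.onVoiceStart();
                    }
                    mCallback.onVoice(buffer, size);
                    mLastVoiceHeardMillis = now;
                    if (now - mVoiceStartedMillis > MAX_SPEECH_LENGTH_MILLIS) {
                        end();
                    }
                } else if (mLastVoiceHeardMillis != Long.MAX_VALUE) {
                    mCallback.onVoice(buffer, size);
                    if (now - mLastVoiceHeardMillis > SPEECH_TIMEOUT_MILLIS) {
                        end();
                    }
                }
            }
        }
    }

stop() should still interrupt the thread before releasing the AudioRecord; once the recorder is stopped or released, an in-flight read() returns a negative error code in practice, which the size < 0 check absorbs, so the processing thread exits cleanly.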

