CarPlay反复连接30次左右后连不上
背景
最近收到一个Bug,carplay在反复连接30次左右就连不上了;具体现象就是,USB能连上,iAP2鉴权也能通过,但就是建立不起CarPlay Session连接,没办法,只有一行一行添加打印了;
调试
首先CarPlay是Apple_PlugIn这边调用bonjour来发现CarPlay服务的,于是在Apple_PlugIn这边添加了很多打印,发现和mdns端通信的socket已经连上了,但在反复连接30次左右之后,这个socket上始终没来数据,猜测是不是bonjour那边出了问题。
先保存一份正常连接情况下的bonjour打印,注意红框地方。
再看出错时的打印:
可以看到正常连接情况下,数据是会走ncm0接口的(这是一个IPV6的网络接口),但出错情况下就没有走ncm0,而是走的lo,这是Linux下默认的回环网卡,通过这个接口是肯定不能正常发送网络包给iPhone的。
分析
可以看到正常情况下是会调用SetupInterfaceList这个函数的,由于对bonjour代码不熟,只能硬啃了,猜测这个函数应该就是在某种情况会更新网络接口链表,而出错的情况下没有更新链表,所以才导致连不上,那就看这个函数是哪里调用就可以了。
分析可以看到在程序启动初始化时会调用一次
// Excerpt of the platform initialisation routine; the other setup steps are elided
// ("//..."). The visible step builds the initial network interface list by calling
// SetupInterfaceList(m), and does so only while err still equals mStatus_NoError.
// NOTE(review): err is declared (and presumably set by earlier steps) in the elided
// part, which also has to supply the return statement -- confirm against the full source.
mDNSexport mStatus mDNSPlatformInit(mDNS *const m)
{
//...
// Tell mDNS core about the network interfaces on this machine.
if (err == mStatus_NoError) err = SetupInterfaceList(m);
}
然后就是下面这个函数也会调用。
// Rebuild the interface list of m: drop every registered interface, enumerate the
// interfaces again, and return the outcome of that enumeration converted from a
// POSIX error code to an mStatus.
mDNSexport mStatus mDNSPlatformPosixRefreshInterfaceList(mDNS *const m)
{
    // Heavyweight by design: the whole list is thrown away and recreated instead of
    // only touching the interfaces that changed (which is what the OS X version does
    // and what this code should eventually do as well).
    ClearInterfaceList(m);

    // The enumeration reports a POSIX-style int; translate it for the caller.
    return PosixErrorToStatus(SetupInterfaceList(m));
}
再看下这个函数是哪里调用的,有几个地方调用了,但下面这个函数最可疑。
// Excerpt of the handler that reacts to interface-change notifications; the
// declarations of pChgRec, readFDs and zeroTimeout are in the elided part ("//...").
// It accumulates the masks returned by ProcessRoutingNotification() and, if the
// accumulated mask is non-zero, rebuilds the whole interface list.
// Consequence: if every notification yields a mask of 0, the list is never refreshed.
mDNSlocal void InterfaceChangeCallback(int fd, short filter, void *context)
{
//...
// OR-ed union of the masks reported for all notifications handled in this call.
mDNSu32 changedInterfaces = 0;
// Process at least one notification, then keep going for as long as select() --
// called with zeroTimeout, presumably a non-blocking poll (TODO confirm in the
// elided setup) -- reports a positive number of ready descriptors.
do
{
changedInterfaces |= ProcessRoutingNotification(pChgRec->NotifySD);
LogInfo("lqy changedInterfaces = %d\n",changedInterfaces);
}
while (0 < select(pChgRec->NotifySD + 1, &readFDs, (fd_set*) NULL, (fd_set*) NULL, &zeroTimeout));
// Currently we rebuild the entire interface list whenever any interface change is
// detected. If this ever proves to be a performance issue in a multi-homed
// configuration, more care should be paid to changedInterfaces.
// Only zero / non-zero matters here; the individual bits are not inspected.
if (changedInterfaces)
mDNSPlatformPosixRefreshInterfaceList(pChgRec->mDNS);
}
经过反复测试加打印发现,当不能连接的时候changedInterfaces是等于0的,if语句后面的自然不会执行,而且拔掉重连也不行,发现这个变量始终是0.关键在这行代码:
changedInterfaces |= ProcessRoutingNotification(pChgRec->NotifySD);
那就进去看看它做了什么
mDNSlocal mDNSu32 ProcessRoutingNotification(int sd)
// Read through the messages on sd and if any indicate that any interface records should
// be torn down and rebuilt, return affected indices as a bitmask. Otherwise return 0.
{
ssize_t readCount;
char buff[4096];
struct nlmsghdr *pNLMsg = (struct nlmsghdr*) buff;
mDNSu32 result = 0;
// The structure here is more complex than it really ought to be because,
// unfortunately, there's no good way to size a buffer in advance large
// enough to hold all pending data and so avoid message fragmentation.
// (Note that FIONREAD is not supported on AF_NETLINK.)
readCount = read(sd, buff, sizeof buff);
while (1)
{
// Make sure we've got an entire nlmsghdr in the buffer, and payload, too.
// If not, discard already-processed messages in buffer and read more data.
if (((char*) &pNLMsg[1] > (buff + readCount)) || // i.e. *pNLMsg extends off end of buffer
((char*) pNLMsg + pNLMsg->nlmsg_len > (buff + readCount)))
{
if (buff < (char*) pNLMsg) // we have space to shuffle
{
// discard processed data
readCount -= ((char*) pNLMsg - buff);
memmove(buff, pNLMsg, readCount);
pNLMsg = (struct nlmsghdr*) buff;
// read more data
readCount += read(sd, buff + readCount, sizeof buff - readCount);
continue; // spin around and revalidate with new readCount
}
else
break; // Otherwise message does not fit in buffer
}
#if MDNS_DEBUGMSGS
PrintNetLinkMsg(pNLMsg);
#endif
// Process the NetLink message
if (pNLMsg->nlmsg_type == RTM_GETLINK || pNLMsg->nlmsg_type == RTM_NEWLINK)
{
if(((struct ifinfomsg*) NLMSG_DATA(pNLMsg))->ifi_index >= 20)
{
result |= 1 << 2;
LogInfo("lqy result(%d), ifi_index(%d), line(%d)", result,((struct ifinfomsg*) NLMSG_DATA(pNLMsg))->ifi_index, __LINE__);
}
else
{
result |= 1 << ((struct ifinfomsg*) NLMSG_DATA(pNLMsg))->ifi_index;
LogInfo("lqy result(%d), ifi_index(%d), line(%d)", result,((struct ifinfomsg*) NLMSG_DATA(pNLMsg))->ifi_index, __LINE__);
}
}
else if (pNLMsg->nlmsg_type == RTM_DELADDR || pNLMsg->nlmsg_type == RTM_NEWADDR)
{
if(((struct ifaddrmsg*) NLMSG_DATA(pNLMsg))->ifa_index >= 20)
{
result |= 1 << 2;
LogInfo("lqy result(%d), ifi_index(%d), line(%d)", result,((struct ifaddrmsg*) NLMSG_DATA(pNLMsg))->ifa_index, __LINE__);
}
else
{
result |= 1 << ((struct ifaddrmsg*) NLMSG_DATA(pNLMsg))->ifa_index;
LogInfo("lqy result(%d), ifi_index(%d), line(%d)", result,((struct ifaddrmsg*) NLMSG_DATA(pNLMsg))->ifa_index, __LINE__);
}
}
// Advance pNLMsg to the next message in the buffer
if ((pNLMsg->nlmsg_flags & NLM_F_MULTI) != 0 && pNLMsg->nlmsg_type != NLMSG_DONE)
{
ssize_t len = readCount - ((char*)pNLMsg - buff);
pNLMsg = NLMSG_NEXT(pNLMsg, len);
}
else
break; // all done!
}
LogInfo("lqy result = %d, line = %d", result, __LINE__);
return result;
}
实际观察发现,当连不上时((struct ifinfomsg*) NLMSG_DATA(pNLMsg))->ifi_index是等于32的,且每重新插拔一次这个数还会继续增加;再看result,它只是一个32位的无符号数。
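嗯,移位溢出了:接口索引达到32之后,1 << ifi_index的移位量已经不小于32位整数的宽度(这在C语言里属于未定义行为),实际得到的result是0,所以changedInterfaces一直是0,接口链表也就不会刷新。修改方法也简单,当这个索引大于等于20的时候,固定左移2位(即result |= 1 << 2)就可以了,只要保证result不为0即可。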
不过话说这种错误真的很难找啊,/(ㄒoㄒ)/~~
参考资料
bonjour版本多少?可以再试试反复连接256次。。。
@niefan 手动插拔测试的, 256次的话../(ㄒoㄒ)/~~
1小时足够了……不行就上脚本